Fix OpenCL winograd convolution bug when output height/width (OH/OW) is not a multiple of 4

pull/6220/head
Pengyongrong 5 years ago
parent 0a9938ddc8
commit 0b4c480474

@ -54,6 +54,62 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
                                                int4 shape) {
  // Converts an NCHW float buffer into an NHWC4 image.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices of the
  // destination image. NOTE(review): the addressing ignores the batch index,
  // so N is presumably 1 — confirm against callers.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global float *src_addr = (__global float *)src_data;
  // In NCHW, consecutive channels are shape.y * shape.z elements apart, so the
  // four channels of one quad must be gathered one scalar at a time. The old
  // "fast path" did a float4 load at channel Z*4, which actually read four
  // consecutive W positions of a single channel and produced wrong data
  // whenever H * W > 1.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    data.x = src_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 2) {
    data.y = src_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 3) {
    data.z = src_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 4) {
    data.w = src_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y];
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
                                               int4 shape) {
  // Half-precision variant of to_format_NCHW_to_NHWC4_IMG_float.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global half *src_addr = (__global half *)src_data;
  // Channels are shape.y * shape.z elements apart in NCHW, so gather each quad
  // component individually. The old fast path's half4 load read four
  // consecutive W positions of one channel instead of four channels and was
  // wrong whenever H * W > 1.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    data.x = src_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 2) {
    data.y = src_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 3) {
    data.z = src_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y];
  }
  if (remain >= 4) {
    data.w = src_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y];
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
@ -198,6 +254,64 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
}
}
}
__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
                                                int4 shape) {
  // Converts an NHWC4 image back into an NCHW float buffer.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global float *dst_addr = (__global float *)dst_data;
  // Consecutive channels are shape.y * shape.z elements apart in NCHW, so each
  // quad component must be scattered individually. The old fast path's float4
  // store at channel Z*4 overwrote four consecutive W positions of one channel
  // instead of four channels. Also removed the unused `offset` computation and
  // `dst_addr += offset` left over from the NHWC variant.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    dst_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y] = data.x;
  }
  if (remain >= 2) {
    dst_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y] = data.y;
  }
  if (remain >= 3) {
    dst_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y] = data.z;
  }
  if (remain >= 4) {
    dst_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y] = data.w;
  }
}
__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
                                               int4 shape) {
  // Half-precision variant of to_format_NHWC4_to_NCHW_BUF_float.
  // shape = (N, H, W, C); X walks H, Y walks W, Z walks the C/4 slices.
  // NOTE(review): batch index is ignored, so N is presumably 1 — confirm.
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global half *dst_addr = (__global half *)dst_data;
  // Scatter each quad component individually: NCHW channel stride is
  // shape.y * shape.z, so the old half4 fast-path store clobbered four
  // consecutive W positions of one channel instead of writing four channels.
  // The unused `offset` / `dst_addr += offset` leftovers were dropped.
  int remain = shape.w - Z * 4;
  if (remain >= 1) {
    dst_addr[((Z * 4 + 0) * shape.y + X) * shape.z + Y] = data.x;
  }
  if (remain >= 2) {
    dst_addr[((Z * 4 + 1) * shape.y + X) * shape.z + Y] = data.y;
  }
  if (remain >= 3) {
    dst_addr[((Z * 4 + 2) * shape.y + X) * shape.z + Y] = data.z;
  }
  if (remain >= 4) {
    dst_addr[((Z * 4 + 3) * shape.y + X) * shape.z + Y] = data.w;
  }
}
__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);

@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
void ActivationOpenClKernel::InitBuffer() {}
int ActivationOpenClKernel::Init() {
in_size_ = in_tensors_[0]->shape().size();
out_size_ = out_tensors_[0]->shape().size();

@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel {
int Run() override;
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
cl_int4 GetImg2dShape();
void InitBuffer();
void InitBuffer() {}
private:
cl::Kernel kernel_;

@ -16,10 +16,10 @@
#include <cstring>
#include <algorithm>
#include <set>
#include<string>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() {
int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. Callers divide by the result,
// so it must never be 0 (the old fallback returned RET_OK, i.e. 0, which
// risked a divide-by-zero downstream).
int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && max_divider >= 8) {
    return number / 8;
  }
  if (number % 4 == 0 && max_divider >= 4) {
    return number / 4;
  }
  if (number % 2 == 0 && max_divider >= 2) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();

@ -20,6 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/batchnorm.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
*img_size = vec;
return RET_OK;
}
int BatchNormOpenCLKernel::Init() {
auto in_format = op_format_;
if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() {
int BatchNormOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. The result is used as a
// divider by the caller, so it must never be 0 (the old fallback returned
// RET_OK, i.e. 0, risking a later divide-by-zero).
int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && 8 <= max_divider) {
    return number / 8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(y);
local->push_back(z);
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);

@ -14,12 +14,12 @@
* limitations under the License.
*/
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/concat.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int
return RET_OK;
}
// Returns number / d for the largest d in {8, 4, 2} that divides number and
// does not exceed max_divider; otherwise the largest plain divisor of number
// that is <= max_divider; 1 as a last resort. The result is used as a
// divider by the caller, so it must never be 0 (the old fallback returned
// RET_OK, i.e. 0, risking a later divide-by-zero).
int ConcatGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && max_divider >= 8) {
    return number / 8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
local->push_back(y);
local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);

@ -19,6 +19,7 @@
#include <algorithm>
#include "src/common/utils.h"
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() {
return RET_OK;
}
int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
}
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6,
6, CI_, 2);
} else {
OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6,
CI_, 2);
}
return RET_OK;
}
template <typename SRC_T, typename DST_T>
int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
// OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
for (size_t kh = 0; kh < KH; ++kh) {
for (size_t kw = 0; kw < KW; ++kw) {
for (size_t ci = 0; ci < CI_; ++ci) {
size_t co_outer = co / (CO_TILE * OGroup);
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
size_t co_inner = co % CO_TILE;
size_t ci_outer = ci / CI_TILE;
size_t ci_inner = ci % CI_TILE;
size_t dst_idx =
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
}
}
}
}
return RET_OK;
}
int ConvolutionOpenCLKernel::InitWeight() {
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() {
// rearrange weight
if (use_winograd_) {
RearrangeWinogradWeight();
GenerateWinogradWeight();
} else {
auto weight_tensor = in_tensors_[1];
if (weight_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
} else {
OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
}
} else {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
} else {
OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
}
}
}
@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
" }\n"
"\n"
" int IH = input_shape.y, IW = input_shape.z;\n"
" int TILE_X = IW / 4;\n"
" int TILE_X = UP_DIV(IW, 4);\n"
" int tile_x = tile_xy % TILE_X;\n"
" int tile_y = tile_xy / TILE_X;\n"
"\n"
@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
std::string code =
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
"\n"
"__constant sampler_t\n"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"\n"
@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
" }\n"
" }\n"
"\n"
" int TILE_X = UP_DIV(OW, 4);\n"
" for (int x = 0; x < 4; x++)\n"
" {\n"
" FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
@ -822,14 +801,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
}
code +=
" int TILE_X = OW / 4;\n"
" int tile_x = tile_xy % TILE_X * 4;\n"
" int tile_y = tile_xy / TILE_X * 4;\n";
" int tile_x = tile_xy % TILE_X;\n"
" int tile_y = tile_xy / TILE_X;\n"
" int ow = tile_x * 4 + x;\n"
" int oh = tile_y * 4 + row;\n";
if (op_format_ == Format_NHWC4) {
code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n";
} else if (op_format_ == Format_NC4HW4) {
code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n";
}
code +=
@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
size_t local_c = GetBiggestDivider(global_c, max_z_size);
size_t local_c = GetMaxDivisor(global_c, max_z_size);
if (local_c == 0) {
MS_LOG(ERROR) << "Divide by zero";
return RET_ERROR;

@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int InitWeight();
int InitBias();
int RearrangeWinogradWeight();
template <typename SRC_T, typename DST_T>
int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
int GenerateWinogradWeight();
std::string CodeGenConvolutionNHWC4();
std::string CodeGenConvolutionNC4HW4();
@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good;
}
static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
std::vector<float> C(M * K);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {
float s = 0.0f;
for (int k = 0; k < N; ++k) {
s += A[i * N + k] * B[k * K + j];
}
C[i * K + j] = s;
}
}
return C;
}
static int GetBiggestDivider(int x, int y) {
for (int i = y; i != 0; i--) {
if (x % i == 0) {
return i;
}
}
return 1;
}
};
} // namespace mindspore::kernel

@ -20,6 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
*img_size = vec;
return RET_OK;
}
int SliceOpenCLKernel::Init() {
std::string kernel_name = "slice";
auto in_format = op_format_;
@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() {
int SliceOpenCLKernel::ReSize() { return RET_OK; }
// Returns number / d for the biggest d in {8, 4, 2} that divides number and
// fits under max_divider; otherwise the largest plain divisor of number that
// is <= max_divider, or 1 as a last resort.
int SliceGetBiggestDividerWithPriority(int number, int max_divider) {
  const int priorities[] = {8, 4, 2};
  for (int d : priorities) {
    if (number % d == 0 && d <= max_divider) {
      return number / d;
    }
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x);
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
local->push_back(y);
local->push_back(z);
}
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
} // namespace mindspore::kernel

@ -16,7 +16,6 @@
#include "src/runtime/kernel/opencl/utils.h"
#include <algorithm>
#include <string>
#include <vector>
#include "src/kernel_registry.h"
@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
}
} // namespace mindspore::lite
namespace mindspore {
namespace kernel {
namespace mindspore::kernel {
// Largest integer in [1, divisor] that evenly divides x; returns 1 when the
// scan finds nothing (e.g. divisor <= 0).
int GetMaxDivisor(int x, int divisor) {
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
// Prefers the power-of-two dividers 8, 4, 2 (largest first) when one of them
// divides x and fits under divisor; otherwise returns the largest plain
// divisor of x that is <= divisor (1 if none).
int GetMaxDivisorStrategy0(int x, int divisor) {
  const int preferred[] = {8, 4, 2};
  for (int p : preferred) {
    if (divisor >= p && x % p == 0) {
      return p;
    }
  }
  // Fallback: the same descending scan GetMaxDivisor performs, inlined here.
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
// Like GetMaxDivisorStrategy0, but returns the quotient x / d for the chosen
// power-of-two divider d in {8, 4, 2}; otherwise the largest plain divisor of
// x that is <= divisor (1 if none).
int GetMaxDivisorStrategy1(int x, int divisor) {
  const int preferred[] = {8, 4, 2};
  for (int p : preferred) {
    if (divisor >= p && x % p == 0) {
      return x / p;
    }
  }
  // Fallback: the same descending scan GetMaxDivisor performs, inlined here.
  for (int d = divisor; d > 0; --d) {
    if (x % d == 0) {
      return d;
    }
  }
  return 1;
}
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
std::vector<size_t> result(3, 1);
std::vector<size_t> result(3);
for (int i = 0; i < 3; ++i) {
result[i] = AlignByN(global[i], local[i]);
result[i] = UP_ROUND(global[i], local[i]);
}
return result;
}
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
size_t wg_z = GetBiggestDividerWithPriority(global[2], 8);
if (wg_z == 0) {
size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
if (local_z == 0) {
MS_LOG(ERROR) << "Divide by zero";
return {};
}
size_t wg_xy_size = max_size / wg_z;
size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size);
size_t wg_y = std::min(wg_xy_size / wg_x, global[1]);
std::vector<size_t> local = {wg_x, wg_y, wg_z};
size_t local_xy = max_size / local_z;
size_t local_x = std::min(UP_DIV(global[0], 2), local_xy);
size_t local_y = std::min(local_xy / local_x, global[1]);
std::vector<size_t> local = {local_x, local_y, local_z};
return local;
}
@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) {
return "Unknown OpenCL error code";
}
}
} // namespace kernel
} // namespace mindspore
} // namespace mindspore::kernel

@ -23,7 +23,7 @@
#include "utils/log_adapter.h"
#include "nnacl/op_base.h"
#include "src/lite_kernel.h"
#include "src/common//utils.h"
#include "src/common/utils.h"
namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
namespace mindspore::kernel {
/**
 * Returns the highest-priority divider of number: 8, 4 or 2 when one of them
 * divides number and fits under max_divider, otherwise the largest plain
 * divisor of number that is <= max_divider (1 as a last resort).
 * @param number value to divide
 * @param max_divider upper bound on the returned divider
 * @return the chosen divider, cast to T
 */
template <typename T, typename N>
T GetBiggestDividerWithPriority(T number, N max_divider) {
  const int priorities[] = {8, 4, 2};
  for (int p : priorities) {
    if (number % p == 0 && p <= max_divider) {
      return (T)p;
    }
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return (T)i;
    }
  }
  return (T)1;
}
int GetMaxDivisor(int x, int divisor);
/**
 * Ceiling division: the smallest q such that q * divisor >= n.
 * Uses the remainder form rather than (n + d - 1) / d so it cannot overflow
 * for n near the maximum of T.
 * @param n must be non negative
 * @param divisor must be greater than zero
 */
template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T d = static_cast<T>(divisor);
  const T quotient = n / d;
  const T remainder = n % d;
  return remainder == 0 ? quotient : quotient + 1;
}
int GetMaxDivisorStrategy0(int x, int divisor);
/**
 * Rounds number up to the nearest multiple of n (ceil-divide then multiply).
 * @param number value to align, must be non negative
 * @param n alignment, must be greater than zero
 */
template <typename T, typename N>
T AlignByN(T number, N n) {
  const T d = static_cast<T>(n);
  const T rounded = (number % d == 0) ? (number / d) : (number / d + 1);
  return rounded * d;
}
int GetMaxDivisorStrategy1(int x, int divisor);
// GetGlobalSize
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);
// GetLocalSize
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size);
std::string CLErrorCode(cl_int error_code);
@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
}
}
}
template <class T1, class T2>
void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
int c4 = UP_DIV(channel, C4NUM);
@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co
}
}
}
template <class T1, class T2>
void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
int c4 = UP_DIV(channel, C4NUM);
@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
}
}
// Multiplies the M x N matrix A by the N x K matrix B (both row-major flat
// arrays) and returns the M x K product as a flat row-major vector.
template <class T>
std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
  std::vector<T> C(M * K);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < K; ++j) {
      // Accumulate in T: the old hard-coded `float s` silently truncated
      // precision for T = double (and broke non-floating T).
      T s = T();
      for (int k = 0; k < N; ++k) {
        s += A[i * N + k] * B[k * K + j];
      }
      C[i * K + j] = s;
    }
  }
  return C;
}
// Repacks convolution weights from 4-D OHWI order into the tiled 7-D layout
// O/(CO_TILE*OGroup) x KH x KW x CI/CI_TILE x OGroup x CI_TILE x CO_TILE
// consumed by the OpenCL convolution kernels.
// src: source weights in OHWI order; dst: pre-allocated destination buffer
// (caller-managed; assumed large enough for the padded tiled layout — TODO
// confirm the caller rounds CO/CI up to the tile sizes).
// OGroup interleaves output-channel groups; the winograd path calls this with
// OGroup = 2, the plain path uses the default of 1.
template <typename SRC_T, typename DST_T>
void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
size_t CI_TILE = 4, size_t CO_TILE = 4) {
auto origin_weight = reinterpret_cast<SRC_T *>(src);
auto packed_weight = reinterpret_cast<DST_T *>(dst);
// Number of CI_TILE-wide input-channel slices (rounded up).
auto CI_SLICES = UP_DIV(CI, CI_TILE);
// src_idx walks the source linearly in OHWI order; each element is scattered
// to its tiled destination index.
for (size_t co = 0, src_idx = 0; co < CO; ++co) {
for (size_t kh = 0; kh < KH; ++kh) {
for (size_t kw = 0; kw < KW; ++kw) {
for (size_t ci = 0; ci < CI; ++ci) {
// Decompose co into (outer tile, group within tile, lane within CO_TILE)
// and ci into (outer slice, lane within CI_TILE).
size_t co_outer = co / (CO_TILE * OGroup);
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
size_t co_inner = co % CO_TILE;
size_t ci_outer = ci / CI_TILE;
size_t ci_inner = ci % CI_TILE;
size_t dst_idx =
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
}
}
}
}
}
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_

@ -133,6 +133,8 @@ Format get_op_format(Format input_format) {
case Format_NHWC:
case Format_NHWC4:
return Format_NHWC4;
case Format_NCHW:
return Format_NHWC4;
default:
return Format_NC4HW4;
}
@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
}
TEST_F(TestConvolutionOpenCL, simple_test0) {
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) {
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
// 1x1 convolution, 2 input / 2 output channels, 2x2 spatial grid, NCHW
// buffers. The attr string describes the logical NHWC shapes; the data arrays
// below are laid out in NCHW order. All-ones weights make each output channel
// the per-pixel sum of the two input channels (0+1, 2+3, 4+5, 6+7).
TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f};
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f};
// Both output channels hold the same per-pixel sums {1, 5, 9, 13} in NCHW.
float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
// Same 1x1 convolution as simple_test0, but with the data supplied directly
// in channel-padded 4-slice layouts (NHWC4 and NC4HW4): each pixel's 2 real
// channels are padded with two zeros to fill a 4-channel quad.
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f};
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f};
// Per-pixel channel sums {1, 5, 9, 13}, duplicated into both real output
// channels and padded with zeros to quads.
float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f,
9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
}
TEST_F(TestConvolutionOpenCL, simple_test1) {
std::string attr =

Loading…
Cancel
Save