diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index 943b84c04d..574ffb8fd1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -54,6 +54,62 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write
   }
   WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
 }
+__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
+                                                int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  FLT4 data = (FLT4)(0.f);
+  __global float *src_addr = (__global float *)src_data;
+  __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
+  __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
+  __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
+  if ((Z + 1) * 4 <= shape.w) {
+    data = TO_FLT4(((__global float4 *)src_addr_0)[0]);
+  } else {
+    if ((shape.w - Z * 4) >= 1) {
+      data.x = src_addr_0[0];
+    }
+    if ((shape.w - Z * 4) >= 2) {
+      data.y = src_addr_1[0];
+    }
+    if ((shape.w - Z * 4) >= 3) {
+      data.z = src_addr_2[0];
+    }
+  }
+  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
+}
+__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
+                                               int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  FLT4 data = (FLT4)(0.f);
+  __global half *src_addr = (__global half *)src_data;
+  __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
+  __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
+  __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
+  if ((Z + 1) * 4 <= shape.w) {
+    data = TO_FLT4(((__global half4 *)src_addr_0)[0]);
+  } else {
+    if ((shape.w - Z * 4) >= 1) {
+      data.x = src_addr_0[0];
+    }
+    if ((shape.w - Z * 4) >= 2) {
+      data.y = src_addr_1[0];
+    }
+    if ((shape.w - Z * 4) >= 3) {
+      data.z = src_addr_2[0];
+    }
+  }
+  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
+}
 __kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
                                                  int4 shape) {
   int X = get_global_id(0);
@@ -198,6 +254,64 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
     }
   }
 }
+__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
+                                                int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+  int offset = (X * shape.z + Y) * shape.w + Z * 4;
+  __global float *dst_addr = (__global float *)dst_data;
+  __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
+  __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
+  __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
+  dst_addr += offset;
+  if ((Z + 1) * 4 <= shape.w) {
+    ((__global float4 *)dst_addr_0)[0] = data;
+  } else {
+    if (shape.w - Z * 4 >= 1) {
+      dst_addr_0[0] = data.x;
+    }
+    if (shape.w - Z * 4 >= 2) {
+      dst_addr_1[0] = data.y;
+    }
+    if (shape.w - Z * 4 >= 3) {
+      dst_addr_2[0] = data.z;
+    }
+  }
+}
+__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
+                                               int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+  int offset = (X * shape.z + Y) * shape.w + Z * 4;
+  __global half *dst_addr = (__global half *)dst_data;
+  __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
+  __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
+  __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
+  dst_addr += offset;
+  if ((Z + 1) * 4 <= shape.w) {
+    ((__global half4 *)dst_addr_0)[0] = data;
+  } else {
+    if (shape.w - Z * 4 >= 1) {
+      dst_addr_0[0] = data.x;
+    }
+    if (shape.w - Z * 4 >= 2) {
+      dst_addr_1[0] = data.y;
+    }
+    if (shape.w - Z * 4 >= 3) {
+      dst_addr_2[0] = data.z;
+    }
+  }
+}
 __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
                                                int4 shape) {
   int X = get_global_id(0);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index bfe7b6b730..d617972399 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation;
 
 namespace mindspore::kernel {
 
-void ActivationOpenClKernel::InitBuffer() {}
-
 int ActivationOpenClKernel::Init() {
   in_size_ = in_tensors_[0]->shape().size();
   out_size_ = out_tensors_[0]->shape().size();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
index cd22208468..58734a314a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel {
   int Run() override;
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
   cl_int4 GetImg2dShape();
-  void InitBuffer();
+  void InitBuffer() {}
 
  private:
   cl::Kernel kernel_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
index bb52f82925..4f95798c75 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@@ -16,10 +16,10 @@
 #include
 #include
 #include
-#include
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() {
 
 int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; }
 
-int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) {
-  if (number % 8 == 0 && max_divider >= 8) {
-    return number / 8;
-  }
-  if (number % 4 == 0 && 4 <= max_divider) {
-    return number / 4;
-  }
-  if (number % 2 == 0 && 2 <= max_divider) {
-    return number / 2;
-  }
-
-  for (int i = max_divider; i != 0; i--) {
-    if (number % i == 0) {
-      return i;
-    }
-  }
-  return RET_OK;
-}
-
 void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 4, max_y = 8;
-  int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x);
+  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
   int yz = max_size / x;
-  int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
+  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
   int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
 
   local->clear();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
index f3e59391e5..e07ea96fc5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@@ -20,6 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/batchnorm.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
   *img_size = vec;
   return RET_OK;
 }
+
 int BatchNormOpenCLKernel::Init() {
   auto in_format = op_format_;
   if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
@@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() {
 
 int BatchNormOpenCLKernel::ReSize() { return RET_OK; }
 
-int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) {
-  if (number % 8 == 0 && 8 <= max_divider) {
-    return number / 8;
-  }
-  if (number % 4 == 0 && 4 <= max_divider) {
-    return number / 4;
-  }
-  if (number % 2 == 0 && 2 <= max_divider) {
-    return number / 2;
-  }
-
-  for (int i = max_divider; i != 0; i--) {
-    if (number % i == 0) {
-      return i;
-    }
-  }
-  return RET_OK;
-}
-
 void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 4, max_y = 8;
-  int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x);
+  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
   int yz = max_size / x;
-  int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
+  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
   int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
 
   local->clear();
@@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
   local->push_back(y);
   local->push_back(z);
 }
+
 int BatchNormOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index 5495d02404..4f44aa4723 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 #include
-#include
 #include
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/concat.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/concat.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int
   return RET_OK;
 }
 
-int ConcatGetBiggestDividerWithPriority(int number, int max_divider) {
-  if (number % 8 == 0 && max_divider >= 8) {
-    return number / 8;
-  }
-  if (number % 4 == 0 && 4 <= max_divider) {
-    return number / 4;
-  }
-  if (number % 2 == 0 && 2 <= max_divider) {
-    return number / 2;
-  }
-
-  for (int i = max_divider; i != 0; i--) {
-    if (number % i == 0) {
-      return i;
-    }
-  }
-  return RET_OK;
-}
-
 void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 4, max_y = 8;
-  int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x);
+  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
   int yz = max_size / x;
-  int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
+  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
   int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
 
   local->clear();
@@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
   local->push_back(y);
   local->push_back(z);
 }
+
 int ConcatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 725c8817ab..5701970747 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -19,6 +19,7 @@
 #include
 #include "src/common/utils.h"
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 
@@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() {
   return RET_OK;
 }
 
-int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
+int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
   constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
                           0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
                           0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
@@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
   }
 
   if (use_fp16_) {
-    OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
+    ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6,
+                                              6, CI_, 2);
   } else {
-    OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
+    ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6,
+                                          CI_, 2);
   }
 
   return RET_OK;
 }
 
-template <typename SRC_T, typename DST_T>
-int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
-  auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
-  auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
-
-  // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
-  for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
-    for (size_t kh = 0; kh < KH; ++kh) {
-      for (size_t kw = 0; kw < KW; ++kw) {
-        for (size_t ci = 0; ci < CI_; ++ci) {
-          size_t co_outer = co / (CO_TILE * OGroup);
-          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
-          size_t co_inner = co % CO_TILE;
-          size_t ci_outer = ci / CI_TILE;
-          size_t ci_inner = ci % CI_TILE;
-          size_t dst_idx =
-            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
-              CO_TILE +
-            co_inner;
-          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
-        }
-      }
-    }
-  }
-  return RET_OK;
-}
-
 int ConvolutionOpenCLKernel::InitWeight() {
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
 
@@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() {
 
   // rearrange weight
   if (use_winograd_) {
-    RearrangeWinogradWeight();
+    GenerateWinogradWeight();
   } else {
     auto weight_tensor = in_tensors_[1];
     if (weight_tensor->data_type() == kNumberTypeFloat16) {
       if (use_fp16_) {
-        OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+        ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
       } else {
-        OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
+        ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
       }
     } else {
       if (use_fp16_) {
-        OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+        ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
      } else {
-        OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
+        ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
       }
     }
   }
@@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
     "  }\n"
     "\n"
     "  int IH = input_shape.y, IW = input_shape.z;\n"
-    "  int TILE_X = IW / 4;\n"
+    "  int TILE_X = UP_DIV(IW, 4);\n"
    "  int tile_x = tile_xy % TILE_X;\n"
     "  int tile_y = tile_xy / TILE_X;\n"
     "\n"
@@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
 std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
   std::string code =
     "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+    "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
+    "\n"
     "__constant sampler_t\n"
     "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
     "\n"
@@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "    }\n"
     "  }\n"
     "\n"
+    "  int TILE_X = UP_DIV(OW, 4);\n"
     "  for (int x = 0; x < 4; x++)\n"
     "  {\n"
     "    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
@@ -822,14 +801,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
   }
 
   code +=
-    "  int TILE_X = OW / 4;\n"
-    "  int tile_x = tile_xy % TILE_X * 4;\n"
-    "  int tile_y = tile_xy / TILE_X * 4;\n";
+    "  int tile_x = tile_xy % TILE_X;\n"
+    "  int tile_y = tile_xy / TILE_X;\n"
+    "  int ow = tile_x * 4 + x;\n"
+    "  int oh = tile_y * 4 + row;\n";
 
   if (op_format_ == Format_NHWC4) {
-    code += "  WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
+    code += "  if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n";
   } else if (op_format_ == Format_NC4HW4) {
-    code += "  WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
+    code += "  if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n";
   }
 
   code +=
@@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
   size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
 
-  size_t local_c = GetBiggestDivider(global_c, max_z_size);
+  size_t local_c = GetMaxDivisor(global_c, max_z_size);
   if (local_c == 0) {
     MS_LOG(ERROR) << "Divide by zero";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 785e144336..2034257c66 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
 
   int InitWeight();
   int InitBias();
-  int RearrangeWinogradWeight();
-  template <typename SRC_T, typename DST_T>
-  int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
+  int GenerateWinogradWeight();
 
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
@@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
-
-  static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
-    std::vector<float> C(M * K);
-    for (int i = 0; i < M; ++i) {
-      for (int j = 0; j < K; ++j) {
-        float s = 0.0f;
-        for (int k = 0; k < N; ++k) {
-          s += A[i * N + k] * B[k * K + j];
-        }
-        C[i * K + j] = s;
-      }
-    }
-    return C;
-  }
-
-  static int GetBiggestDivider(int x, int y) {
-    for (int i = y; i != 0; i--) {
-      if (x % i == 0) {
-        return i;
-      }
-    }
-    return 1;
-  }
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc
index bea1e6c8e3..7fb58b9e9f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc
@@ -20,6 +20,7 @@
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/slice.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/slice.cl.inc"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   *img_size = vec;
   return RET_OK;
 }
+
 int SliceOpenCLKernel::Init() {
   std::string kernel_name = "slice";
   auto in_format = op_format_;
@@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() {
 
 int SliceOpenCLKernel::ReSize() { return RET_OK; }
 
-int SliceGetBiggestDividerWithPriority(int number, int max_divider) {
-  if (number % 8 == 0 && 8 <= max_divider) {
-    return number / 8;
-  } else if (number % 4 == 0 && 4 <= max_divider) {
-    return number / 4;
-  } else if (number % 2 == 0 && 2 <= max_divider) {
-    return number / 2;
-  }
-  for (int i = max_divider; i != 0; i--) {
-    if (number % i == 0) {
-      return i;
-    }
-  }
-  return 1;
-}
-
 void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 4, max_y = 8;
-  int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x);
+  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
   int yz = max_size / x;
-  int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
+  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
   int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
 
   local->clear();
@@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
   local->push_back(y);
   local->push_back(z);
 }
+
 int SliceOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
@@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &
 
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
-
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index 91375c045f..6cc9fb018b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -16,7 +16,6 @@
 
 #include "src/runtime/kernel/opencl/utils.h"
 #include
-#include
 #include
 #include "src/kernel_registry.h"
 
@@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<lite::Tensor *> &in_tensors, con
   }
 }
 }  // namespace mindspore::lite
 
-namespace mindspore {
-namespace kernel {
+namespace mindspore::kernel {
+
+int GetMaxDivisor(int x, int divisor) {
+  int i = divisor;
+  while (i > 0) {
+    if (x % i == 0) {
+      return i;
+    }
+    i--;
+  }
+  return 1;
+}
+
+int GetMaxDivisorStrategy0(int x, int divisor) {
+  if (divisor >= 8 && x % 8 == 0) {
+    return 8;
+  } else if (divisor >= 4 && x % 4 == 0) {
+    return 4;
+  } else if (divisor >= 2 && x % 2 == 0) {
+    return 2;
+  } else {
+    return GetMaxDivisor(x, divisor);
+  }
+}
+
+int GetMaxDivisorStrategy1(int x, int divisor) {
+  if (divisor >= 8 && x % 8 == 0) {
+    return x / 8;
+  } else if (divisor >= 4 && x % 4 == 0) {
+    return x / 4;
+  } else if (divisor >= 2 && x % 2 == 0) {
+    return x / 2;
+  } else {
+    return GetMaxDivisor(x, divisor);
+  }
+}
+
 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
-  std::vector<size_t> result(3, 1);
+  std::vector<size_t> result(3);
   for (int i = 0; i < 3; ++i) {
-    result[i] = AlignByN(global[i], local[i]);
+    result[i] = UP_ROUND(global[i], local[i]);
   }
   return result;
 }
 
 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
-  size_t wg_z = GetBiggestDividerWithPriority(global[2], 8);
-  if (wg_z == 0) {
+  size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
+  if (local_z == 0) {
     MS_LOG(ERROR) << "Divide by zero";
     return {};
   }
-  size_t wg_xy_size = max_size / wg_z;
-  size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size);
-  size_t wg_y = std::min(wg_xy_size / wg_x, global[1]);
-  std::vector<size_t> local = {wg_x, wg_y, wg_z};
+  size_t local_xy = max_size / local_z;
+  size_t local_x = std::min(UP_DIV(global[0], 2), local_xy);
+  size_t local_y = std::min(local_xy / local_x, global[1]);
+  std::vector<size_t> local = {local_x, local_y, local_z};
   return local;
 }
 
@@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) {
       return "Unknown OpenCL error code";
   }
 }
-}  // namespace kernel
-}  // namespace mindspore
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index ed62ccf5d2..a15a245a9f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -23,7 +23,7 @@
 #include "utils/log_adapter.h"
 #include "nnacl/op_base.h"
 #include "src/lite_kernel.h"
-#include "src/common//utils.h"
+#include "src/common/utils.h"
 
 namespace mindspore::lite {
 kernel::LiteKernel *GetOpenCLKernel(const std::vector<lite::Tensor *> &in_tensors, const std::vector<lite::Tensor *> &out_tensors,
@@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<lite::Tensor *> &in_tensors, con
 
 namespace mindspore::kernel {
 
-/**
- * GetLocalSize
- * @param number
- * @param max_divider
- * @return
- */
-template <typename T, typename N>
-T GetBiggestDividerWithPriority(T number, N max_divider) {
-  if (number % 8 == 0 && 8 <= max_divider) {
-    return (T)8;
-  }
-  if (number % 4 == 0 && 4 <= max_divider) {
-    return (T)4;
-  }
-  if (number % 2 == 0 && 2 <= max_divider) {
-    return (T)2;
-  }
-  for (int i = max_divider; i != 0; i--) {
-    if (number % i == 0) {
-      return (T)i;
-    }
-  }
-  return (T)1;
-}
+int GetMaxDivisor(int x, int divisor);
 
-/**
- * GetLocalSize
- * @param n must be non negative
- * @param divisor must be greater than zero
- * @return
- */
-template <typename T, typename N>
-T DivideRoundUp(T n, N divisor) {
-  const T div = static_cast<T>(divisor);
-  const T q = n / div;
-  return n % div == 0 ? q : q + 1;
-}
+int GetMaxDivisorStrategy0(int x, int divisor);
 
-/**
- * GetLocalSize
- * @param number
- * @param n
- * @return
- */
-template <typename T, typename N>
-T AlignByN(T number, N n) {
-  return DivideRoundUp(number, n) * n;
-}
+int GetMaxDivisorStrategy1(int x, int divisor);
 
-// GetGlobalSize
 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);
 
-// GetLocalSize
 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size);
 
 std::string CLErrorCode(cl_int error_code);
@@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
     }
   }
 }
+
 template <class T1, class T2>
 void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
   int c4 = UP_DIV(channel, C4NUM);
@@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co
     }
   }
 }
+
 template <class T1, class T2>
 void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
   int c4 = UP_DIV(channel, C4NUM);
@@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
     }
   }
 }
 
+template <class T>
+std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
+  std::vector<T> C(M * K);
+  for (int i = 0; i < M; ++i) {
+    for (int j = 0; j < K; ++j) {
+      float s = 0.0f;
+      for (int k = 0; k < N; ++k) {
+        s += A[i * N + k] * B[k * K + j];
+      }
+      C[i * K + j] = s;
+    }
+  }
+  return C;
+}
+
+template <class SRC_T, class DST_T>
+void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
+                             size_t CI_TILE = 4, size_t CO_TILE = 4) {
+  auto origin_weight = reinterpret_cast<SRC_T *>(src);
+  auto packed_weight = reinterpret_cast<DST_T *>(dst);
+  auto CI_SLICES = UP_DIV(CI, CI_TILE);
+  for (size_t co = 0, src_idx = 0; co < CO; ++co) {
+    for (size_t kh = 0; kh < KH; ++kh) {
+      for (size_t kw = 0; kw < KW; ++kw) {
+        for (size_t ci = 0; ci < CI; ++ci) {
+          size_t co_outer = co / (CO_TILE * OGroup);
+          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
+          size_t co_inner = co % CO_TILE;
+          size_t ci_outer = ci / CI_TILE;
+          size_t ci_inner = ci % CI_TILE;
+          size_t dst_idx =
+            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
+              CO_TILE +
+            co_inner;
+          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index e6f56178c7..13b9a7d43f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -133,6 +133,8 @@ Format get_op_format(Format input_format) {
     case Format_NHWC:
     case Format_NHWC4:
       return Format_NHWC4;
+    case Format_NCHW:
+      return Format_NHWC4;
     default:
       return Format_NC4HW4;
   }
@@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256
   TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
 }
 
-TEST_F(TestConvolutionOpenCL, simple_test0) {
+TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
   std::string attr =
     "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
   float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) {
   TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
   TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
 }
+TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
+  float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f};
+  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  float bias_data[] = {0.0f, 0.0f};
+  float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
+  TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
+  TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
+}
+
+TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
+  std::string attr =
+    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
+  float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f};
+  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  float bias_data[] = {0.0f, 0.0f};
+  float expect_data[] = {1.0f, 1.0f,  0.0f, 0.0f, 5.0f,  5.0f,  0.0f, 0.0f,
+                         9.0f, 9.0f,  0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
+  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
+            expect_data);
+  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
+            expect_data);
+  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
+            expect_data);
+  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
+            expect_data);
+}
 
 TEST_F(TestConvolutionOpenCL, simple_test1) {
   std::string attr =