From 4140b13ce12f7e17eefa1bac15fde745bf0b5730 Mon Sep 17 00:00:00 2001
From: wangdongxu
Date: Wed, 2 Sep 2020 16:57:59 +0800
Subject: [PATCH] opencl convolution support NC4HW4

---
 .../kernel/opencl/kernel/convolution.cc | 349 ++++++++++++++----
 .../kernel/opencl/kernel/convolution.h  |   4 +-
 .../kernel/opencl/convolution_tests.cc  | 309 +++++++++++-----
 3 files changed, 485 insertions(+), 177 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 8200549b6d..61db5700fe 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -33,10 +33,18 @@ namespace mindspore::kernel {
 int ConvolutionOpenCLKernel::Init() {
   static int init_count = 0;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  use_fp16_ = ocl_runtime->GetFp16Enable();
   auto allocator = ocl_runtime->GetAllocator();
   std::set<std::string> build_options;
   init_count++;
+  use_fp16_ = ocl_runtime->GetFp16Enable();
+
+  if (op_format_ != schema::Format_NHWC4 && op_format_ != schema::Format_NC4HW4) {
+    MS_LOG(ERROR) << "op_format_ " << op_format_ << " not supported!";
+  }
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);

   CI = in_tensors_[0]->Channel();
   IH = in_tensors_[0]->Height();
@@ -70,7 +78,8 @@ int ConvolutionOpenCLKernel::Init() {
     ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
   } else {
     std::string program_name = "convolution" + std::to_string(init_count);
-    ocl_runtime->LoadSource(program_name, CodeGenConvolution());
+    std::string source = op_format_ == schema::Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
+    ocl_runtime->LoadSource(program_name, source);
     ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
   }
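For context on the dispatch above (not part of the patch): NHWC4 and NC4HW4 differ only in how a logical (h, w, channel-slice) element maps to 2D image coordinates, which is why the rest of the kernel logic can stay the same. A minimal sketch of the mapping the two codegen paths assume; the helper names are hypothetical:

  // Hypothetical illustration, mirroring the indexing in the generated kernels:
  // NHWC4 interleaves channel slices along the image x-axis, while NC4HW4
  // stacks one H-tall band per channel slice along the image y-axis.
  struct XY { int x; int y; };
  XY CoordNHWC4(int h, int w, int c_slice, int H, int CI_SLICES) {
    return {w * CI_SLICES + c_slice, h};
  }
  XY CoordNC4HW4(int h, int w, int c_slice, int H, int CI_SLICES) {
    return {w, c_slice * H + h};  // e.g. READ_IMAGE at (iw, ci_slice * IH + ih)
  }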
@@ -91,10 +100,7 @@ int ConvolutionOpenCLKernel::Init() {
   }

   this->InitBuffer();
-  in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
+
   MS_LOG(DEBUG) << "Convolution Init Done!";
   return RET_OK;
 }
@@ -282,6 +288,12 @@ int ConvolutionOpenCLKernel::Run() {
     ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG);
     ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
     ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (op_format_ == schema::Format_NC4HW4) {
+      cl_int4 input_shape = {1, IH, IW, CI_SLICES};
+      cl_int4 output_shape = {1, OH, OW, CO_SLICES};
+      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, input_shape);
+      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, output_shape);
+    }
   }

   if (use_winograd_) {
@@ -297,7 +309,7 @@ int ConvolutionOpenCLKernel::Run() {
   return RET_OK;
 }

-std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
+std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   const size_t CI_ALIGN = CI_SLICES * C4NUM;
   const size_t CO_ALIGN = CO_SLICES * C4NUM;
@@ -344,8 +356,8 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
     "{\n";

   code +=
-    " int oh = get_global_id(0); // [0, OH)\n"
-    " int ow = get_global_id(1); // [0, OW)\n"
+    " int ow = get_global_id(0); // [0, OW)\n"
+    " int oh = get_global_id(1); // [0, OH)\n"
     " int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n"
     "\n"
     " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
@@ -396,66 +408,237 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
   return code;
 }

+std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
+  const size_t KH = param->kernel_h_;
+  const size_t KW = param->kernel_w_;
+  const size_t strideH = param->stride_h_;
+  const size_t strideW = param->stride_w_;
+  const size_t padTop = param->pad_u_;
+  const size_t padBottom = param->pad_d_;
+  const size_t padLeft = param->pad_l_;
+
+  std::string code;
+
+  if (use_fp16_) {
+    code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+  }
+
+  code +=
+    "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+    "\n"
+    "__kernel void Convolution(__read_only image2d_t input,\n"
+    "                          __write_only image2d_t output,\n"
+    "                          __global FLT4 *weight,\n"
+    "                          __global FLT4 *bias,\n"
+    "                          const int4 input_shape,\n"
+    "                          const int4 output_shape)\n"
+    "{\n"
+    " int ow = get_global_id(0) * 2;\n"
+    " int oh = get_global_id(1);\n"
+    " int co_slice = get_global_id(2);\n"
+    "\n"
+    " int CI_SLICES = input_shape.w;\n"
+    " int CO_SLICES = output_shape.w;\n\n";
+
+  code += " #define IH " + std::to_string(IH) + "\n";
+  code += " #define IW " + std::to_string(IW) + "\n";
+  code += " #define OH " + std::to_string(OH) + "\n";
+  code += " #define OW " + std::to_string(OW) + "\n";
+  code += " #define KH " + std::to_string(KH) + "\n";
+  code += " #define KW " + std::to_string(KW) + "\n";
+  code += " #define strideH " + std::to_string(strideH) + "\n";
+  code += " #define strideW " + std::to_string(strideW) + "\n";
+  code += " #define padTop 
" + std::to_string(padTop) + "\n"; + code += " #define padLeft " + std::to_string(padLeft) + "\n\n"; + + code += + " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" + " return;\n\n"; + + bool check_ow = (OW % 2) == 1; + if (check_ow) { + code += + " int last_is_double = 1;\n" + " if (ow + 1 >= OW)\n" + " last_is_double = 0;\n\n"; + } + + code += + " FLT4 out0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 out1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " __global FLT4 *w = weight + co_slice * KH * KW * CI_SLICES * 4;\n" + "\n" + " for (int kh = 0; kh < KH; ++kh)\n" + " {\n" + " int ih = kh + oh * strideH - padTop;\n" + " for (int kw = 0; kw < KW; ++kw)\n" + " {\n"; + + if (padTop || padBottom) { + code += + "if (ih >= 0 && ih < IH)\n" + "{\n"; + } + + code += " int iw0 = kw + (ow + 0) * strideW - padLeft;\n"; + if (check_ow) { + code += + " if (last_is_double)\n" + " {\n"; + } + + code += + " int iw1 = kw + (ow + 1) * strideW - padLeft;\n" + " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" + " {\n" + " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, ci_slice * IH + ih));\n" + " out0 += w[0] * in0.x;\n" + " out0 += w[1] * in0.y;\n" + " out0 += w[2] * in0.z;\n" + " out0 += w[3] * in0.w;\n" + " FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(iw1, ci_slice * IH + ih));\n" + " out1 += w[0] * in1.x;\n" + " out1 += w[1] * in1.y;\n" + " out1 += w[2] * in1.z;\n" + " out1 += w[3] * in1.w;\n" + " w += 4;\n" + " }\n"; + if (check_ow) { + code += + " }\n" + " else\n" + " {\n" + " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" + " {\n" + " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, ci_slice * IH + ih));\n" + " out0 += w[0] * in0.x;\n" + " out0 += w[1] * in0.y;\n" + " out0 += w[2] * in0.z;\n" + " out0 += w[3] * in0.w;\n" + " w += 4;\n" + " }\n" + " }\n"; + } + if (padTop || padBottom) { + code += + "}\n" + "else\n" + "{\n" + " w += CI_SLICES * 4;\n" + "}\n"; + } + code += + " }\n" + " }\n\n"; + + code += " out0 = out0 + bias[co_slice];\n"; + if (param->act_type_ == ActType_Relu) { + code += " out0 = max(out0, (FLT4)(0.0f));\n"; + } else if (param->act_type_ == ActType_Relu6) { + code += " out0 = clamp(out0, (FLT4)(0.0f), (FLT4)(6.0f));\n"; + } + code += " WRITE_IMAGE(output, (int2)(ow + 0, co_slice * OH + oh), out0);\n"; + + if (check_ow) { + code += + " if (last_is_double)" + " {\n"; + } + code += " out1 = out1 + bias[co_slice];\n"; + if (param->act_type_ == ActType_Relu) { + code += " out1 = max(out1, (FLT4)(0.0f));\n"; + } else if (param->act_type_ == ActType_Relu6) { + code += " out1 = clamp(out1, (FLT4)(0.0f), (FLT4)(6.0f));\n"; + } + code += " WRITE_IMAGE(output, (int2)(ow + 1, co_slice * OH + oh), out1);\n"; + if (check_ow) { + code += "}\n"; + } + code += "}\n"; + + return code; +} + std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { - return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" - "#define PAD 1\n" - "\n" - "__constant sampler_t\n" - "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" - "\n" - "constant FLT Bt[36] = {\n" - " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" - " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" - " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" - " 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n" - " 0.0000000000f, 0.1178511307f, 
-0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n" - " 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n" - "};\n" - "\n" - "__kernel void Winograd4x4To36(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n" - " int4 input_shape, // N H W CI_SLICES\n" - " int4 output_shape) // N 36 H/4*W/4 CI_SLICES\n" - "{\n" - " int tile_xy = get_global_id(0);\n" - " int row = get_global_id(1);\n" - " int slice = get_global_id(2);\n" - "\n" - " int TILE_XY = output_shape.z;\n" - " int SLICES = input_shape.w;\n" - " if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n" - " {\n" - " return;\n" - " }\n" - "\n" - " int IH = input_shape.y, IW = input_shape.z;\n" - " int TILE_X = IW / 4;\n" - " int tile_x = tile_xy % TILE_X;\n" - " int tile_y = tile_xy / TILE_X;\n" - "\n" - " constant FLT *Bt_row = Bt + row * 6;\n" - " FLT4 BtD_row[6] = {0};\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " int y_idx = tile_y * 4 - PAD + y;\n" - " for (int x = 0; x < 6; x++)\n" - " {\n" - " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n" - " BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n" - " }\n" - " }\n" - "\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " for (int x = 0; x < 6; x++)\n" - " {\n" - " acc += BtD_row[x] * Bt[y * 6 + x];\n" - " }\n" - " WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" - " }\n" - "}"; + std::string code; + code += + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" + "#define PAD 1\n" + "\n" + "__constant sampler_t\n" + "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" + "\n" + "constant FLT Bt[36] = {\n" + " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" + " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" + " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" + " 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n" + " 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n" + " 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n" + "};\n" + "\n" + "__kernel void Winograd4x4To36(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " int4 input_shape, // N H W CI_SLICES\n" + " int4 output_shape) // N 36 H/4*W/4 CI_SLICES\n" + "{\n" + " int tile_xy = get_global_id(0);\n" + " int row = get_global_id(1);\n" + " int slice = get_global_id(2);\n" + "\n" + " int TILE_XY = output_shape.z;\n" + " int SLICES = input_shape.w;\n" + " if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n" + " {\n" + " return;\n" + " }\n" + "\n" + " int IH = input_shape.y, IW = input_shape.z;\n" + " int TILE_X = IW / 4;\n" + " int tile_x = tile_xy % TILE_X;\n" + " int tile_y = tile_xy / TILE_X;\n" + "\n" + " constant FLT *Bt_row = Bt + row * 6;\n" + " FLT4 BtD_row[6] = {0};\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " int y_idx = tile_y * 4 - PAD + y;\n"; + + if (op_format_ == schema::Format_NHWC4) { + code += + " for (int x = 0; x < 6; x++)\n" + " {\n" + " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n"; + } else if (op_format_ == schema::Format_NC4HW4) { + code += + " if(y_idx < 0 || y_idx >= IH)\n" + " {\n" + " continue;\n" + " }\n" + " 
y_idx += slice * IH;\n"
+      " for (int x = 0; x < 6; x++)\n"
+      " {\n"
+      "  int x_idx = tile_x * 4 - PAD + x;\n";
+  }
+
+  code +=
+    "   BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n"
+    "  }\n"
+    " }\n"
+    "\n"
+    " for (int y = 0; y < 6; y++)\n"
+    " {\n"
+    "  FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+    "  for (int x = 0; x < 6; x++)\n"
+    "  {\n"
+    "   acc += BtD_row[x] * Bt[y * 6 + x];\n"
+    "  }\n"
+    "  WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n"
+    " }\n"
+    "}";
+  return code;
 }

 std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
@@ -602,8 +785,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
   code +=
     " int TILE_X = OW / 4;\n"
     " int tile_x = tile_xy % TILE_X * 4;\n"
-    " int tile_y = tile_xy / TILE_X * 4;\n"
-    " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H width=WC\n"
+    " int tile_y = tile_xy / TILE_X * 4;\n";
+
+  if (op_format_ == schema::Format_NHWC4) {
+    code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
+  }
+
+  code +=
     " }\n"
     "}";
   return code;
@@ -632,18 +822,29 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
     local_h = global_h / 2;
   }

-  if (OW * CO_SLICES > 65536) {
-    local_w = 4;
+  if (op_format_ == schema::Format_NHWC4) {
+    if (OW * CO_SLICES > 65536) {
+      local_w = 4;
+    }
   }

   global->clear();
-  global->push_back(UP_DIV(OH, local_h) * local_h);
   global->push_back(UP_DIV(OW, local_w) * local_w);
+  global->push_back(UP_DIV(OH, local_h) * local_h);
   global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
   local->clear();
-  local->push_back(local_h);
   local->push_back(local_w);
+  local->push_back(local_h);
   local->push_back(local_c);
+
+  if (op_format_ == schema::Format_NC4HW4) {
+    // calculate 2 FLT4 along width per work-item
+    global->at(0) = UP_DIV(global->at(0), 2);
+    if (local->at(0) > global->at(0)) {
+      local->at(0) = global->at(0);
+    }
+  }
+
   return RET_OK;
 }

diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 851fd09a13..fc3cd21880 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -64,7 +64,9 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_conv;
   cl::Kernel kernel_36to4x4;

-  std::string CodeGenConvolution();
+  std::string CodeGenConvolutionNHWC4();
+  std::string CodeGenConvolutionNC4HW4();
+
   std::string CodeGenWinograd4x4To36();
   std::string CodeGenWinogradConvolution();
   std::string CodeGenWinograd36To4x4();
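A quick worked example of the NC4HW4 work-size halving in SetGlobalLocalConv above (numbers assumed for illustration): each NC4HW4 work-item computes two horizontally adjacent outputs (out0 and out1 in the generated kernel), so the x-dimension of the NDRange shrinks by half:

  // Illustrative values: OW = 112, local_w = 4.
  size_t global_w = UP_DIV(112, 4) * 4;  // = 112, as pushed above (width is dimension 0 now)
  global_w = UP_DIV(global_w, 2);        // = 56; pairs with "int ow = get_global_id(0) * 2;"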
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index 457021b8a7..8e72a0c935 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -26,165 +26,270 @@
 using mindspore::kernel::ConvolutionOpenCLKernel;
 using mindspore::kernel::LiteKernel;
 using mindspore::kernel::SubGraphOpenCLKernel;
+using mindspore::lite::tensor::Tensor;
+using mindspore::schema::Format;
+using mindspore::schema::Format_KHWC;
+using mindspore::schema::Format_NC4HW4;
+using mindspore::schema::Format_NCHW;
+using mindspore::schema::Format_NHWC;
+using mindspore::schema::Format_NHWC4;
+using mindspore::schema::NodeType_ValueNode;

 namespace mindspore {
 class TestConvolutionOpenCL : public mindspore::CommonTest {};

-void LoadData(void *dst, size_t dst_size, const std::string &file_path) {
-  if (file_path.empty()) {
-    memset(dst, 0x00, dst_size);
+void LoadData(Tensor *tensor, const float *src) {
+  if (tensor->data_type() == kNumberTypeFloat16) {
+    auto num = tensor->Size() / 2;
+    auto tensor_data = reinterpret_cast<uint16_t *>(tensor->Data());
+    for (int i = 0; i < num; ++i) {
+      tensor_data[i] = Float32ToShort(src[i]);
+    }
   } else {
-    auto src_data = mindspore::lite::ReadFile(file_path.c_str(), &dst_size);
-    memcpy(dst, src_data, dst_size);
+    memcpy(tensor->Data(), src, tensor->Size());
   }
 }

-void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path, const TypeId data_type,
-                     const float atol) {
-  size_t output_size = output_tensor->Size();
-  auto output_data_ori = output_tensor->Data();
-  auto expect_data_ori = mindspore::lite::ReadFile(file_path.c_str(), &output_size);
-  std::vector<float> output_data_vec(output_tensor->ElementsC4Num());
-  std::vector<float> expect_data_vec(output_tensor->ElementsC4Num());
-  float *output_data, *expect_data;
-  if (data_type == kNumberTypeFloat16) {
-    for (int i = 0; i < output_data_vec.size(); ++i) {
-      output_data_vec[i] = ShortToFloat32(reinterpret_cast<uint16_t *>(output_data_ori)[i]);
-      expect_data_vec[i] = ShortToFloat32(reinterpret_cast<uint16_t *>(expect_data_ori)[i]);
+void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
+  auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
+  std::vector<float> output_data(num);
+  if (output->data_type() == kNumberTypeFloat16) {
+    auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->Data());
+    for (int i = 0; i < output_data.size(); ++i) {
+      output_data[i] = ShortToFloat32(output_data_fp16[i]);
     }
-    output_data = output_data_vec.data();
-    expect_data = expect_data_vec.data();
   } else {
-    output_data = reinterpret_cast<float *>(output_data_ori);
-    expect_data = reinterpret_cast<float *>(expect_data_ori);
+    memcpy(output_data.data(), output->Data(), output->Size());
   }

-  printf("\noutput[0:10]:");
-  for (int i = 0; i < 10; i++) {
-    printf("%d:%.3f ", i, output_data[i]);
+  printf("output:");
+  for (int i = 0; i < std::min(10, output->ElementsNum()); i++) {
+    printf("%7.3f ", output_data[i]);
   }
   printf("\n");

-  for (int i = 0; i < output_tensor->ElementsNum(); ++i) {
-    if (std::fabs(output_data[i] - expect_data[i]) > atol) {
-      printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
-      printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
-      printf("error at idx[%d] expect=%.3f output=%.3f\n\n\n", i, expect_data[i], output_data[i]);
-      return;
+  float max_err = 0.0f;
+  std::array<int, 5> idx_5d{};
+  int idx = -1;
+  auto SLICES = UP_DIV(output->Channel(), 4);
+  int I = 1, J = 1, K = 1, L = 1, M = 1;
+  switch (output->GetFormat()) {
+    case Format_NHWC:
+      I = output->Batch(), J = output->Height(), K = output->Width(), L = output->Channel();
+      break;
+    case Format_NCHW:
+      I = output->Batch(), J = output->Channel(), K = output->Height(), L = output->Width();
+      break;
+    case Format_NHWC4:
+      I = output->Batch(), J = output->Height(), K = output->Width(), L = SLICES, M = 4;
+      break;
+    case Format_NC4HW4:
+      I = output->Batch(), J = SLICES, K = output->Height(), L = output->Width(), M = 4;
+      break;
+    default:
+      break;
+  }
+
+  int cn = 0;
+  for (int i = 0; i < I; ++i) {
+    for (int j = 0; j < J; ++j) {
+      for (int k = 0; k < K; ++k) {
+        for (int l = 0; l < L; ++l) {
+          for (int m = 0; m < M; ++m) {
+            auto err = std::fabs(output_data[cn] - expect_data[cn]);
+            if (err > max_err) {
+              max_err = err;
+              idx_5d = {i, j, k, l, m};
+              idx = cn;
+            }
+            cn++;
+          }
+        }
+      }
     }
   }
-  printf("COMPARE SUCCESS!\n\n\n");
+
+  float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
+  if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
+    printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
+  } else {
+    printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
+  }
+  printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
+         max_err, relative_err * 100);
+
+  if (max_err > atol) {
+    FAIL();
+  } else {
+    printf("COMPARE SUCCESS!\n\n");
+  }
+}
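To make the CompareOutput loop bounds concrete (shapes assumed for illustration): for a 1x2x2x6 output, SLICES = UP_DIV(6, 4) = 2, and the traversal visits

  Format_NHWC   : I=1 (N), J=2 (H), K=2 (W), L=6 (C),      M=1 -> 24 values
  Format_NHWC4  : I=1 (N), J=2 (H), K=2 (W), L=2 (SLICES), M=4 -> 32 values (channels zero-padded 6 -> 8)
  Format_NC4HW4 : I=1 (N), J=2 (SLICES), K=2 (H), L=2 (W), M=4 -> 32 values

so the running index cn walks the buffer in exactly the order the kernel wrote it, whichever op format is under test.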
+
+Format get_op_format(Format input_format) {
+  switch (input_format) {
+    case Format_NHWC:
+    case Format_NHWC4:
+      return Format_NHWC4;
+    default:
+      return Format_NC4HW4;
+  }
+}

-void TEST_MAIN(schema::Format input_format, schema::Format output_format, const TypeId data_type,
-               const std::string &data_path, std::string attr_str) {
-  auto param = new (std::nothrow) ConvParameter;
+void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, const TypeId data_type,
+               const float atol, const float *input_data, const float *weight_data, const float *bias_data,
+               const float *expect_data) {
+  auto param = std::make_unique<ConvParameter>();
   if (param == nullptr) {
+    MS_LOG(ERROR) << "ConvParameter create error.";
     return;
   }
-  sscanf(attr_str.c_str(),
+  sscanf(attr.c_str(),
         "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
         "dilationHW_%dx%d",
         &param->input_batch_, &param->input_h_, &param->input_w_, &param->input_channel_, &param->output_batch_,
         &param->output_h_, &param->output_w_, &param->output_channel_, &param->kernel_h_, &param->kernel_w_,
         &param->stride_h_, &param->stride_w_, &param->pad_u_, &param->pad_d_, &param->pad_l_, &param->pad_r_,
         &param->dilation_h_, &param->dilation_w_);

-  auto testcase_path = data_path + "/" + attr_str + "/";
-  auto input_file = testcase_path + (input_format == schema::Format_NHWC4 ? "input_NHWC4.bin" : "input_NHWC.bin");
-  auto weight_file = testcase_path + "weight_OHWI.bin";
-  auto bias_file = testcase_path + "bias_C.bin";
-  auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin");
-  std::cout << "input_file  :" << input_file << std::endl;
-  std::cout << "weight_file :" << weight_file << std::endl;
-  std::cout << "bias_file   :" << bias_file << std::endl;
-  std::cout << "expect_file :" << expect_file << std::endl;
-
-  std::cout << "initialize OpenCLRuntime and OpenCLAllocator";
+  MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
   ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16);
   auto allocator = ocl_runtime->GetAllocator();

-  std::cout << "create Tensors";
+  MS_LOG(DEBUG) << "create Tensors";
   std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
   std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
   std::vector<int> bias_shape = {param->output_channel_};
   std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
"expect_NHWC4.bin" : "expect_NHWC.bin"); - std::cout << "input_file :" << input_file << std::endl; - std::cout << "weight_file :" << weight_file << std::endl; - std::cout << "bias_file :" << bias_file << std::endl; - std::cout << "expect_file :" << expect_file << std::endl; - std::cout << "initialize OpenCLRuntime and OpenCLAllocator"; + MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); auto allocator = ocl_runtime->GetAllocator(); - std::cout << "create Tensors"; + MS_LOG(DEBUG) << "create Tensors"; std::vector input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_}; std::vector weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector bias_shape = {param->output_channel_}; std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; - auto tensor_type = schema::NodeType_ValueNode; - auto input_tensor = lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type); - auto weight_tensor = lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type); - auto bias_tensor = lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type); - auto output_tensor = lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type); - std::vector inputs{&input_tensor, &weight_tensor, &bias_tensor}; - std::vector outputs{&output_tensor}; - - std::cout << "allocate memory and initialize weight/bias"; - weight_tensor.MallocData(); - bias_tensor.MallocData(); - LoadData(weight_tensor.Data(), weight_tensor.Size(), weight_file); - LoadData(bias_tensor.Data(), bias_tensor.Size(), bias_file); - - std::cout << "create OpenCL Kernel"; - auto kernel = ConvolutionOpenCLKernel(reinterpret_cast(param), inputs, outputs); + auto input = Tensor(data_type, input_shape, input_format, NodeType_ValueNode); + auto weight = Tensor(data_type, weight_shape, Format_KHWC, NodeType_ValueNode); + auto bias = Tensor(data_type, bias_shape, Format_KHWC, NodeType_ValueNode); + auto output = Tensor(data_type, output_shape, output_format, NodeType_ValueNode); + + MS_LOG(DEBUG) << "allocate memory and initialize weight/bias"; + weight.MallocData(); + bias.MallocData(); + LoadData(&weight, weight_data); + LoadData(&bias, bias_data); + + MS_LOG(DEBUG) << "create OpenCL Kernel"; + auto kernel = + ConvolutionOpenCLKernel(reinterpret_cast(param.release()), {&input, &weight, &bias}, {&output}); + kernel.SetFormatType(get_op_format(input_format)); kernel.Init(); - std::cout << "create SubGraph"; - auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input_tensor}, outputs, {&kernel}, {&kernel}, {&kernel}); + MS_LOG(DEBUG) << "create SubGraph"; + auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input}, {&output}, {&kernel}, {&kernel}, {&kernel}); if (sub_graph == nullptr) { return; } - input_tensor.MallocData(allocator); // before MapBuffer() + input.MallocData(allocator); sub_graph->Init(); - LoadData(input_tensor.Data(), input_tensor.Size(), input_file); // after MapBuffer() - if (data_type == kNumberTypeFloat16) { - printf("input[0] =%.3f\n", ShortToFloat32(reinterpret_cast(input_tensor.Data())[0])); - printf("weight[0]=%.3f\n", ShortToFloat32(reinterpret_cast(weight_tensor.Data())[0])); - printf("bias[0] =%.3f\n", ShortToFloat32(reinterpret_cast(bias_tensor.Data())[0])); - } 
-TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp32) {
-  TEST_MAIN(
-    schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/mobilenetv2_fp32/",
-    "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
-    "1x1");
+void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, const TypeId data_type,
+               const float atol, const std::string &data_path) {
+  auto testcase_path = data_path + "/" + attr + "/";
+  std::map<Format, std::string> format_str{
+    {Format_NCHW, "NCHW"}, {Format_NHWC, "NHWC"}, {Format_NHWC4, "NHWC4"}, {Format_NC4HW4, "NC4HW4"}};
+  auto input_file = testcase_path + "input_" + format_str[input_format] + ".bin";
+  auto weight_file = testcase_path + "weight_OHWI.bin";
+  auto bias_file = testcase_path + "bias_C.bin";
+  auto expect_file = testcase_path + "expect_" + format_str[output_format] + ".bin";
+  MS_LOG(DEBUG) << "input_file  :" << input_file;
+  MS_LOG(DEBUG) << "weight_file :" << weight_file;
+  MS_LOG(DEBUG) << "bias_file   :" << bias_file;
+  MS_LOG(DEBUG) << "expect_file :" << expect_file;
+
+  size_t dst_size;
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_file.c_str(), &dst_size));
+  auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_file.c_str(), &dst_size));
+  auto bias_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(bias_file.c_str(), &dst_size));
+  auto expect_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(expect_file.c_str(), &dst_size));
+  printf("input [0-2]: %7.3f %7.3f %7.3f\n", input_data[0], input_data[1], input_data[2]);
+  printf("weight[0-2]: %7.3f %7.3f %7.3f\n", weight_data[0], weight_data[1], weight_data[2]);
+  printf("bias  [0-2]: %7.3f %7.3f %7.3f\n", bias_data[0], bias_data[1], bias_data[2]);
+  printf("expect[0-2]: %7.3f %7.3f %7.3f\n", expect_data[0], expect_data[1], expect_data[2]);
+
+  TEST_MAIN(attr, input_format, output_format, data_type, atol, input_data, weight_data, bias_data, expect_data);
 }

-TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp16) {
-  TEST_MAIN(
-    schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/mobilenetv2_fp16/",
+TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
+  std::string attr =
     "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
-    "1x1");
+    "1x1";
+  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/");
+  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/");
+  TEST_MAIN(attr, 
Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/"); + TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/"); +} + +TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { + std::string attr = + "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_" + "1x1"; + TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); + TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); + TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); + TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); +} + +TEST_F(TestConvolutionOpenCL, simple_test0) { + std::string attr = + "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; + float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + float bias_data[] = {0.0f, 0.0f}; + float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f}; + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); +} + +TEST_F(TestConvolutionOpenCL, simple_test1) { + std::string attr = + "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; + float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + float bias_data[] = {0.5f, -0.5f}; + float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f}; + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp32) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/test_fp32/", - "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); +TEST_F(TestConvolutionOpenCL, simple_test2) { + std::string attr = + "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; + float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + float bias_data[] = {0.0f}; + float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f}; + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp16) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/test_fp16/", - "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); +TEST_F(TestConvolutionOpenCL, 
simple_test3) { + std::string attr = + "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; + float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; + float bias_data[] = {0.5f, -0.5f}; + float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f}; + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); + TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); } } // namespace mindspore
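As a cross-check on simple_test1's expect_data (reference only, not part of the test suite): with a 1x1 kernel, the OHWI weight tensor holds CO x CI = 4 values, so only the first four entries of weight_data are consumed (co0 = {1, 2}, co1 = {3, 4}). A naive NHWC convolution reproduces the expected numbers:

  #include <cstdio>

  int main() {
    const int H = 2, W = 2, CI = 2, CO = 2;
    float input[H][W][CI] = {{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}};
    float weight[CO][CI] = {{1, 2}, {3, 4}};  // OHWI with KH = KW = 1 collapses to [CO][CI]
    float bias[CO] = {0.5f, -0.5f};
    for (int h = 0; h < H; ++h) {
      for (int w = 0; w < W; ++w) {
        for (int co = 0; co < CO; ++co) {
          float acc = bias[co];
          for (int ci = 0; ci < CI; ++ci) acc += input[h][w][ci] * weight[co][ci];
          printf("%.1f ", acc);  // prints 2.5 3.5 8.5 17.5 14.5 31.5 20.5 45.5
        }
      }
    }
    return 0;
  }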