diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index c2e5e9a447..c7e6cf8efa 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -18,108 +18,203 @@
 #include <set>
 #include <string>
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
-#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
 #include "src/kernel_registry.h"
+#include "include/errorcode.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
 
 namespace mindspore::kernel {
 
 int ConvolutionOpenCLKernel::Init() {
-  MS_LOG(INFO) << "ConvolutionOpenCLKernel::Init()";
-
-  if (inputs_[0]->Batch() != 1 || outputs_[0]->Batch() != 1) {
-    MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!";
-  }
-
-  auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC;
-  auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4;
-  if (!io_NHWC && !io_NHWC4) {
-    MS_LOG(ERROR) << "input and output data_format is invalid!";
-  }
-  io_dataformat_ = inputs_[0]->GetFormat();
-
-  if (inputs_[1]->GetFormat() != schema::Format_KHWC) {
-    MS_LOG(ERROR) << "weight data_format is invalid!";
-  }
-
+  std::cout << "ConvolutionOpenCLKernel::Init()\n";
   std::set<std::string> build_options;
-  std::string source = convolution_source_fp32;
+  std::string source = CodeGen();
   std::string program_name = "convolution";
-  std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI";
+  std::string kernel_name = "convolution";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
   this->InitBuffer();
+  return RET_OK;
+}
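The notable design change in `Init()` is that the kernel source is no longer a static `.cl.inc` blob (`convolution_source_fp32`) but is generated per layer by `CodeGen()`, with every shape, stride, and pad baked into the source as a `#define`, so the OpenCL compiler sees compile-time constants. A minimal, self-contained sketch of that pattern (names here are illustrative, not the PR's API):

```cpp
#include <iostream>
#include <string>

// Round x up to a count of div-sized groups, as UP_DIV does in the PR.
static int UpDiv(int x, int div) { return (x + div - 1) / div; }

// Toy version of CodeGen(): bake shape constants into the OpenCL source so the
// compiler can unroll loops and fold addressing. The real code appends the kernel body too.
static std::string GenDefines(int ci, int co) {
  std::string code;
  code += "#define CI_SLICES " + std::to_string(UpDiv(ci, 4)) + "\n";
  code += "#define CO_SLICES " + std::to_string(UpDiv(co, 4)) + "\n";
  return code;
}

int main() {
  std::cout << GenDefines(3, 32);  // prints CI_SLICES 1, CO_SLICES 8
  return 0;
}
```

Since the shapes are frozen into the program binary, any resize requires regenerating and rebuilding the kernel, which is presumably why `ReSize()` is dropped further down.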
+
+std::string ConvolutionOpenCLKernel::CodeGen() {
+  auto param = reinterpret_cast<ConvParameter *>(opParameter);
+
+  auto input_tensor = inputs_[0];
+  auto output_tensor = outputs_[0];
+  const size_t CI = input_tensor->Channel();
+  const size_t CI_SLICES = UP_DIV(CI, C4NUM);
+  const size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
+  const size_t IH = input_tensor->Height();
+  const size_t IW = input_tensor->Width();
+  const size_t CO = output_tensor->Channel();
+  const size_t CO_SLICES = UP_DIV(CO, C4NUM);
+  const size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
+  const size_t OH = output_tensor->Height();
+  const size_t OW = output_tensor->Width();
+  const size_t KH = param->kernel_h_;
+  const size_t KW = param->kernel_w_;
+  const size_t strideH = param->stride_h_;
+  const size_t strideW = param->stride_w_;
+  const size_t padTop = param->pad_u_;
+  const size_t padBottom = param->pad_d_;
+  const size_t padLeft = param->pad_l_;
+  const size_t padRight = param->pad_r_;
+
+  std::string code;
+  code += "#define CI_TILE 4\n";
+  code += "#define CO_TILE 4\n\n";
+  code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
+  code += "#define IH " + std::to_string(IH) + "\n";
+  code += "#define IW " + std::to_string(IW) + "\n";
+  code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
+  code += "#define OH " + std::to_string(OH) + "\n";
+  code += "#define OW " + std::to_string(OW) + "\n";
+  code += "#define KH " + std::to_string(KH) + "\n";
+  code += "#define KW " + std::to_string(KW) + "\n";
+  code += "#define strideH " + std::to_string(strideH) + "\n";
+  code += "#define strideW " + std::to_string(strideW) + "\n";
+  code += "#define padTop " + std::to_string(padTop) + "\n";
+  code += "#define padBottom " + std::to_string(padBottom) + "\n";
+  code += "#define padLeft " + std::to_string(padLeft) + "\n";
+  code += "#define padRight " + std::to_string(padRight) + "\n";
+  code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n";
+  code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n";
+
+#ifdef ENABLE_FP16
+  code +=
+    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+    "#define FLT4 half4\n"
+    "#define READ_FLT4 read_imageh\n"
+    "#define WRITE_FLT4 write_imageh\n\n";
+#else
+  code +=
+    "#define FLT4 float4\n"
+    "#define READ_FLT4 read_imagef\n"
+    "#define WRITE_FLT4 write_imagef\n\n";
+#endif
+
+  code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n";
+
+  code +=
+    "__kernel void convolution(__read_only image2d_t input,\n"
+    "                          __global FLT4 *weight,\n"
+    "                          __global FLT4 *bias,\n"
+    "                          __write_only image2d_t output)\n"
+    "{\n";
+
+  code +=
+    "  int oh = get_global_id(0);       // [0, OH)\n"
+    "  int ow = get_global_id(1);       // [0, OW)\n"
+    "  int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n"
+    "\n"
+    "  if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
+    "    return;\n"
+    "\n"
+    "  FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+    "  __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n";
+
+  code +=
+    "  for (int kh = 0; kh < KH; ++kh)\n"
+    "  {\n"
+    "    int ih = kh + oh * strideH - padTop;\n"
+    "    for (int kw = 0; kw < KW; ++kw)\n"
+    "    {\n"
+    "      int iw = kw + ow * strideW - padLeft;\n"
+    "      if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)\n"
+    "      {\n"
+    "        for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
+    "        {\n";
+
+  // NHWC4 NHC4W4 NC4HW4
+  code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n";
+  // code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ih * CI_SLICES + ci_slice)); // NHC4W4: HC W\n\n";
+  // code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ci_slice * IH + ih)); // NC4HW4: CH W\n\n";
+
+  code +=
+    "          out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n"
+    "          out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n"
+    "          out0_c4 += w0_ic1_oc4[2] * in_c4.z;\n"
+    "          out0_c4 += w0_ic1_oc4[3] * in_c4.w;\n"
+    "          w0_ic1_oc4 += 4;\n"
+    "        }\n"
+    "      }\n"
+    "      else\n"
+    "      {\n"
+    "        w0_ic1_oc4 += 4 * CI_SLICES;\n"
+    "      }\n"
+    "    }\n"
+    "  }\n\n";
+  code += "  FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
+
+  // NHWC4 NHC4W4 NC4HW4
+  if (OW * CO_SLICES < 65536) {
+    code += "  WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias); // NHWC4: H WC\n}";
+  } else {
+    code += "  WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias); // NHWC4: H WC\n}";
+  }
+  // code += "  WRITE_FLT4(output, (int2)(ow, oh * CO_SLICES + co_slice), out0_c4_bias); // NHC4W4: HC W\n}";
+  // code += "  WRITE_FLT4(output, (int2)(ow, co_slice * OH + oh), out0_c4_bias); // NC4HW4: CH W\n}";
 
-  return 0;
+  // std::cout << code << std::endl;
+  return code;
 }
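Both `CodeGen()` above and `GetImageSize()` below switch the output coordinate mapping when `OW * CO_SLICES >= 65536` (note the fallback's trailing comment still reads `NHWC4: H WC` even though it swaps the roles of `oh` and `ow`). The 65536 looks like an assumed image2d width cap for the target GPUs; the OpenCL spec only guarantees 8192, and the real limit is queryable, so a stricter guard could compare against the device value instead of a constant. A sketch using the standard OpenCL C API (the `cl_device_id` is assumed to be already obtained via `clGetDeviceIDs`):

```cpp
#include <CL/cl.h>

// Returns true if an (ow * co_slices)-texel-wide image fits this device's image2d width cap.
bool FitsImageWidth(cl_device_id device, size_t ow, size_t co_slices) {
  size_t max_width = 0;
  clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(max_width), &max_width, nullptr);
  return ow * co_slices <= max_width;
}
```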
+
 int ConvolutionOpenCLKernel::InitBuffer() {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
 
+  // weight: OHWI -> OHWIIO
   auto weight_tensor = inputs_[1];
-  auto bias_tensor = inputs_[2];
-  if (io_dataformat_ == schema::Format_NHWC) {
-    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_tensor->Size()));
-    packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
-    memcpy(packed_weight_, weight_tensor->Data(), weight_tensor->Size());
-    allocator->UnmapBuffer(packed_weight_);
-
-    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_tensor->Size()));
-    packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
-    memcpy(packed_bias_, bias_tensor->Data(), bias_tensor->Size());
-    allocator->UnmapBuffer(packed_bias_);
-  } else if (io_dataformat_ == schema::Format_NHWC4) {
-    // OHWI -> OHWIIO
-    auto weight_shape = weight_tensor->shape();
-    size_t CO = weight_shape[0];
-    size_t KH = weight_shape[1];
-    size_t KW = weight_shape[2];
-    size_t CI = weight_shape[3];
-    size_t CI_SLICES = UP_DIV(CI, C4NUM);
-    size_t CO_SLICES = UP_DIV(CO, C4NUM);
-    constexpr size_t CI_TILE = C4NUM;
-    constexpr size_t CO_TILE = C4NUM;
-    size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
-
-    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
-    packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
-    memset(packed_weight_, 0x00, packed_weight_size);
-    auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-    for (int co = 0; co < CO; ++co) {
-      for (int kh = 0; kh < KH; ++kh) {
-        for (int kw = 0; kw < KW; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            auto co_outer = co / CO_TILE;
-            auto co_inner = co % CO_TILE;
-            auto ci_outer = ci / CI_TILE;
-            auto ci_inner = ci % CI_TILE;
-            packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
-                           co_inner] = *(weight_data++);
-          }
-        }
-      }
-    }
-    allocator->UnmapBuffer(packed_weight_);
-    size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
-    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
-    packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
-    memset(packed_bias_, 0x00, packed_bias_size);
-    auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
-    for (int co = 0; co < CO; ++co) {
-      packed_bias_[co] = bias_data[co];
-    }
-    allocator->UnmapBuffer(packed_bias_);
-  }
-
-  return 0;
-}  // namespace mindspore::kernel
+  auto weight_shape = weight_tensor->shape();
+  size_t CO = weight_shape[0];
+  size_t KH = weight_shape[1];
+  size_t KW = weight_shape[2];
+  size_t CI = weight_shape[3];
+  size_t CI_SLICES = UP_DIV(CI, C4NUM);
+  size_t CO_SLICES = UP_DIV(CO, C4NUM);
+  constexpr size_t CI_TILE = C4NUM;
+  constexpr size_t CO_TILE = C4NUM;
+  size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
+  packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
+  packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
+  memset(packed_weight_, 0x00, packed_weight_size);
+  auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
+  for (int co = 0; co < CO; ++co) {
+    for (int kh = 0; kh < KH; ++kh) {
+      for (int kw = 0; kw < KW; ++kw) {
+        for (int ci = 0; ci < CI; ++ci) {
+          auto co_outer = co / CO_TILE;
+          auto co_inner = co % CO_TILE;
+          auto ci_outer = ci / CI_TILE;
+          auto ci_inner = ci % CI_TILE;
+          packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
+                         co_inner] = *(weight_data++);
        }
      }
    }
+  }
+  allocator->UnmapBuffer(packed_weight_);
 
+  // align bias
+  auto bias_tensor = inputs_[2];
+  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
+  packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
+  packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
+  memset(packed_bias_, 0x00, packed_bias_size);
+  auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
+  for (int co = 0; co < CO; ++co) {
+    packed_bias_[co] = bias_data[co];
+  }
+  allocator->UnmapBuffer(packed_bias_);
 
-int ConvolutionOpenCLKernel::ReSize() { return 0; }
+  return RET_OK;
+}  // namespace mindspore::kernel
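The weight repack is the heart of `InitBuffer()`: OHWI source order is rewritten so that, for each output-channel slice and each kernel tap, four input channels times four output channels sit contiguously, which is exactly the order `w0_ic1_oc4` walks in the generated kernel. A standalone sketch of the same index arithmetic on plain vectors, handy for unit-testing the formula on the CPU (toy helper, not part of the PR):

```cpp
#include <vector>

constexpr int kTile = 4;  // CI_TILE == CO_TILE == C4NUM
static int UpDiv(int x, int d) { return (x + d - 1) / d; }

// Repack OHWI weights into OHWIIO: a 4-wide input-channel tile, then a 4-wide
// output-channel tile, matching InitBuffer()'s packed_weight_ index formula.
// Tail channels beyond CI/CO stay zero, like the memset above.
std::vector<float> PackOHWIIO(const std::vector<float> &src, int CO, int KH, int KW, int CI) {
  const int ci_slices = UpDiv(CI, kTile), co_slices = UpDiv(CO, kTile);
  std::vector<float> dst(co_slices * KH * KW * ci_slices * kTile * kTile, 0.0f);
  const float *p = src.data();
  for (int co = 0; co < CO; ++co)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
        for (int ci = 0; ci < CI; ++ci)
          dst[((((co / kTile) * KH + kh) * KW + kw) * ci_slices + ci / kTile) * kTile * kTile +
              (ci % kTile) * kTile + co % kTile] = *p++;
  return dst;
}
```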
 
 static int GetBiggestDivider(int x, int y) {
   for (int i = y; i != 0; i--) {
@@ -130,18 +225,19 @@ static int GetBiggestDivider(int x, int y) {
   return 1;
 }
 
-static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
+int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto param = reinterpret_cast<ConvParameter *>(opParameter);
+
   constexpr size_t work_group_size[] = {4, 4, 1};
-  constexpr size_t max_work_item_sizes[] = {512, 512, 512};
-  constexpr size_t max_work_group_size = 512;
+  auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
+  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
   const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
 
-  // first initialize global from OH/OW/CO_SLICES, aligned to the (4, 4, 1) work-group size
   size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
   size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
   size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];
 
-  // derive local from global using the strategy below
   size_t local_c = GetBiggestDivider(global_c, max_z_size);
   size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
   size_t local_w = std::min(global_w, local_hw_size);
@@ -158,62 +254,53 @@
   local->push_back(local_h);
   local->push_back(local_w);
   local->push_back(local_c);
+  return RET_OK;
+}
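`GetGlobalLocal()` keeps the old strategy but swaps the hard-coded 512s for real device queries: round each global dimension up to the (4, 4, 1) work-group step, pick the largest divisor of `global_c` that fits the Z limit, then split the remaining work-group budget between W and H. The up-rounding matters for correctness as well as occupancy: before OpenCL 2.0 each global size must be an exact multiple of the local size, and the `if (oh >= OH || ...) return;` guard in the generated kernel discards the padding work-items. A condensed, runnable sketch of the divisor step (the constants are illustrative stand-ins for the device queries):

```cpp
#include <algorithm>
#include <cstdio>

// Largest divisor of x that is <= y; falls back to 1, like GetBiggestDivider().
static int BiggestDivider(int x, int y) {
  for (int i = y; i > 0; --i)
    if (x % i == 0) return i;
  return 1;
}

int main() {
  int global_c = 8;                              // e.g. UP_DIV(32, 4), already step-aligned
  int local_c = BiggestDivider(global_c, 16);    // 16 = capped max Z work-item size
  int local_hw = std::min(256, 512) / local_c;   // 512 stands in for the kernel's max group size
  std::printf("local_c=%d local_hw=%d\n", local_c, local_hw);  // local_c=8 local_hw=32
  return 0;
}
```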
+
+int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+  size_t CO_SLICES = UP_DIV(outputs_[0]->Channel(), C4NUM);
+  size_t im_dst_x, im_dst_y;
+  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
+    if (outputs_[0]->Width() * CO_SLICES < 65536) {
+      {
+        im_dst_x = outputs_[0]->Width() * CO_SLICES;
+        im_dst_y = outputs_[0]->Height();
+      }
+    } else {
+      im_dst_x = outputs_[0]->Height() * CO_SLICES;
+      im_dst_y = outputs_[0]->Width();
+    }
+  } else {
+    im_dst_y = outputs_[0]->Height() * CO_SLICES;
+    im_dst_x = outputs_[0]->Width();
+  }
+#ifdef ENABLE_FP16
+  size_t img_dtype = CL_HALF_FLOAT;
+#else
+  size_t img_dtype = CL_FLOAT;
+#endif
+  img_size->clear();
+  img_size->push_back(im_dst_x);
+  img_size->push_back(im_dst_y);
+  img_size->push_back(img_dtype);
+  return RET_OK;
+}
 
 int ConvolutionOpenCLKernel::Run() {
-  MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()";
+  std::cout << "ConvolutionOpenCLKernel::Run()\n";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
 
-  auto param = reinterpret_cast<ConvParameter *>(opParameter);
-  auto input0_shape = inputs_[0]->shape();   // NHWC
-  auto input1_shape = inputs_[1]->shape();   // OHWI
-  auto outpu0_shape = outputs_[0]->shape();  // NHWC
-  cl_int N = input0_shape[0];
-  cl_int CI = input0_shape[3];
-  cl_int IH = input0_shape[1];
-  cl_int IW = input0_shape[2];
-  cl_int CO = outpu0_shape[3];
-  cl_int OH = outpu0_shape[1];
-  cl_int OW = outpu0_shape[2];
-  cl_int KH = input1_shape[1];
-  cl_int KW = input1_shape[2];
-  cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
-  cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
-
-  cl_int4 input_shape;
-  cl_int4 output_shape;
-  if (io_dataformat_ == schema::Format_NHWC) {
-    input_shape = {N, IH, IW, CI};
-    output_shape = {N, OH, OW, CO};
-  } else if (io_dataformat_ == schema::Format_NHWC4) {
-    input_shape = {N, IH, IW, CI_ALIGN};
-    output_shape = {N, OH, OW, CO_ALIGN};
-  }
-  cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_};
-  cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
-
   int arg_cn = 0;
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_weight_);
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
   ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad);
 
   std::vector<size_t> global;
   std::vector<size_t> local;
-  GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local);
-
-  // float8 per thread
-  if (io_dataformat_ == schema::Format_NHWC4) {
-    local[2] = UP_DIV(local[2], 2);
-    global[2] = UP_DIV(global[2], 2);
-    global[2] = UP_DIV(global[2], global[2]) * global[2];
-  }
+  GetGlobalLocal(&global, &local);
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
-
-  return 0;
+  return RET_OK;
 }
 
 kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index f757b1968a..d9da43df49 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
 
 #include <vector>
+#include <string>
 #include "src/ir/tensor.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "schema/model_generated.h"
@@ -26,23 +27,25 @@
 
 namespace mindspore::kernel {
 
-class ConvolutionOpenCLKernel : public LiteKernel {
+class ConvolutionOpenCLKernel : public OpenCLKernel {
  public:
   explicit ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                    const std::vector<lite::tensor::Tensor *> &outputs)
-      : LiteKernel(parameter, inputs, outputs) {}
+      : OpenCLKernel(parameter, inputs, outputs) {}
   ~ConvolutionOpenCLKernel() override{};
 
   int Init() override;
-  int ReSize() override;
   int Run() override;
   int InitBuffer();
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
 
  private:
-  schema::Format io_dataformat_ = schema::Format_NHWC4;
   float *packed_weight_ = nullptr;
   float *packed_bias_ = nullptr;
   cl::Kernel kernel_;
+
+  std::string CodeGen();
+  int GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local);
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 8a9c47ce4a..f8f1115ec8 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -297,6 +297,7 @@ if (SUPPORT_GPU)
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/utils_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
         ${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc
+        ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc
     )
 endif()
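The header change from `LiteKernel` to `OpenCLKernel` is what makes the memory side work: the framework can now ask the kernel, via the `GetImageSize()` override, how large an image2d its output needs as a {width, height, channel dtype} triple. Roughly, the consuming side can turn that triple into an image like this (hypothetical glue code, not the PR's allocator):

```cpp
#include <vector>
#include <CL/cl2.hpp>

// Create an RGBA image2d from the {width, height, channel_dtype} triple that
// GetImageSize() fills in; img_size[2] is CL_FLOAT or CL_HALF_FLOAT.
cl::Image2D MakeImage(const cl::Context &ctx, const std::vector<size_t> &img_size) {
  cl::ImageFormat fmt(CL_RGBA, static_cast<cl_channel_type>(img_size[2]));
  return cl::Image2D(ctx, CL_MEM_READ_WRITE, fmt, img_size[0], img_size[1]);
}
```

RGBA is the natural channel order here, since each texel holds one four-channel C4 slice.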
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
new file mode 100644
index 0000000000..86efc7886d
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -0,0 +1,179 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
+#include "src/runtime/kernel/arm/nnacl/pack.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
+
+using mindspore::kernel::ConvolutionOpenCLKernel;
+using mindspore::kernel::LiteKernel;
+using mindspore::kernel::SubGraphOpenCLKernel;
+
+namespace mindspore {
+
+class TestConvolutionOpenCL : public mindspore::Common {};
+
+void LoadData(void *dst, size_t dst_size, const std::string &file_path) {
+  if (file_path.empty()) {
+    memset(dst, 0x00, dst_size);
+  } else {
+    auto src_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &dst_size));
+    memcpy(dst, src_data, dst_size);
+  }
+}
+
+void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) {
+  auto *output_data = reinterpret_cast<float *>(output_tensor->Data());
+  printf("output[0:10]:");
+  for (int i = 0; i < 10; i++) {
+    printf("%d:%.3f ", i, output_data[i]);
+  }
+  printf("\n");
+
+  size_t output_size = output_tensor->Size();
+  auto expect_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size));
+  constexpr float atol = 0.5;
+  for (int i = 0; i < output_tensor->ElementsNum(); ++i) {
+    if (std::fabs(output_data[i] - expect_data[i]) > atol) {
+      printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
+      printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
+      printf("error at idx[%d] expect=%.3f output=%.3f\n\n\n", i, expect_data[i], output_data[i]);
+      return;
+    }
+  }
+  printf("compare success!\n");
+  printf("compare success!\n");
+  printf("compare success!\n\n\n");
+}
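One fragility in the helpers above: the `ReadFile` results are dereferenced unchecked, and `LoadData` takes `dst_size` by value while passing its address down, so a short or missing file can silently read past the returned buffer. A hardened variant of the same helper (the extra checks are mine; `read_file` stands in for `mindspore::lite::ReadFile`):

```cpp
#include <cstdio>
#include <cstring>
#include <string>

// Hypothetical hardened LoadData: zero-fill on an empty path, reject missing
// files and size mismatches instead of memcpy-ing from an unchecked pointer.
bool LoadDataChecked(void *dst, size_t dst_size, const std::string &file_path,
                     char *(*read_file)(const char *, size_t *)) {
  if (file_path.empty()) {
    memset(dst, 0x00, dst_size);
    return true;
  }
  size_t file_size = dst_size;
  char *src = read_file(file_path.c_str(), &file_size);
  if (src == nullptr || file_size != dst_size) {
    std::fprintf(stderr, "bad read: %s (got %zu, want %zu)\n", file_path.c_str(), file_size, dst_size);
    return false;
  }
  memcpy(dst, src, dst_size);
  return true;
}
```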
+
+void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::string &input_file,
+               const std::string &weight_file, const std::string &bias_file, const std::string &expect_file) {
+  assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4);
+
+  std::cout << "initialize OpenCLRuntime";
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  ocl_runtime->Init();
+  auto allocator = ocl_runtime->GetAllocator();
+
+  std::cout << "create inputs/weights/outputs Tensors(framework do)";
+  std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
+  std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
+  std::vector<int> bias_shape = {param->output_channel_};
+  std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
+  auto data_type = kNumberTypeFloat32;
+  auto tensorType = schema::NodeType_ValueNode;
+  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, data_format, tensorType);
+  auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType);
+  auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType);
+  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, data_format, tensorType);
+  std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
+  std::vector<lite::tensor::Tensor *> outputs{output_tensor};
+
+  std::cout << "initialize weight Tensors data(framework do)";
+  std::vector<float> weight_vec(weight_tensor->ElementsNum());
+  std::vector<float> bias_vec(weight_tensor->ElementsNum());
+  weight_tensor->SetData(weight_vec.data());
+  bias_tensor->SetData(bias_vec.data());
+  LoadData(weight_tensor->Data(), weight_tensor->Size(), weight_file);
+  LoadData(bias_tensor->Data(), bias_tensor->Size(), bias_file);
+
+  std::cout << "create OpenCL Kernel";  // weight has been allocated by framework
+  auto *conv_kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  conv_kernel->Init();
+  std::vector<LiteKernel *> kernels{conv_kernel};
+
+  // framework to do!!! allocate memory by hand
+  inputs[0]->MallocData(allocator);
+
+  std::cout << "create SubGraphOpenCLKernel";
+  auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels);
+  sub_graph->Init();
+
+  std::cout << "initialize input Tensors data";  // inputs have been allocated by sub_graph->Init()
+  LoadData(input_tensor->Data(), input_tensor->Size(), input_file);
+  printf("input[0] =%.3f\n", reinterpret_cast<float *>(input_tensor->Data())[0]);
+  printf("weight[0]=%.3f\n", reinterpret_cast<float *>(weight_tensor->Data())[0]);
+  printf("bias[0]  =%.3f\n", reinterpret_cast<float *>(bias_tensor->Data())[0]);
+
+  std::cout << "sub_graph->Run()";
+  sub_graph->Run();
+  printf("output_tensor->Size() =%zu\n", output_tensor->Size());
+
+  std::cout << "compare result";
+  MyCompareOutput(output_tensor, expect_file);
+  //  lite::CompareOutput(reinterpret_cast<float *>(output_tensor->Data()), expect_file);
+
+  mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
+}
+
+std::array<std::string, 4> GenFilenames(ConvParameter *param, schema::Format data_format, const std::string &path) {
+  auto full_path = path + "inputNHWC_" + std::to_string(param->input_batch_) + "x" + std::to_string(param->input_h_) +
+                   "x" + std::to_string(param->input_w_) + "x" + std::to_string(param->input_channel_) +
+                   "_outputNHWC_" + std::to_string(param->output_batch_) + "x" + std::to_string(param->output_h_) +
+                   "x" + std::to_string(param->output_w_) + "x" + std::to_string(param->output_channel_) +
+                   "_kernelHW_" + std::to_string(param->kernel_h_) + "x" + std::to_string(param->kernel_w_) +
+                   "_strideHW_" + std::to_string(param->stride_h_) + "x" + std::to_string(param->stride_w_) +
+                   "_padTopBottomLeftRight_" + std::to_string(param->pad_u_) + "x" + std::to_string(param->pad_d_) +
+                   "x" + std::to_string(param->pad_l_) + "x" + std::to_string(param->pad_r_) + "_dilationHW_1x1/";
+
+  if (data_format == schema::Format_NHWC4) {
+    return std::array<std::string, 4>{full_path + "input_NHWC4.bin", full_path + "weight_OHWI.bin",
+                                      full_path + "bias_C4.bin", full_path + "expect_NHWC4.bin"};
+  } else {
+    return std::array<std::string, 4>{full_path + "input_NHWC.bin", full_path + "weight_OHWI.bin",
+                                      full_path + "bias_C.bin", full_path + "expect_NHWC.bin"};
+  }
+}
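As a concrete example of the naming convention, the first test case below resolves to the directory `testcases/mobilenetv2_fp32/inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1/`, from which `input_NHWC4.bin`, `weight_OHWI.bin`, `bias_C4.bin`, and `expect_NHWC4.bin` are loaded, so the binary test data has to be laid out on disk under exactly that path.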
+
+TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
+  auto param = new ConvParameter;
+  param->input_batch_ = 1, param->input_h_ = 224, param->input_w_ = 224, param->input_channel_ = 3;
+  param->output_batch_ = 1, param->output_h_ = 112, param->output_w_ = 112, param->output_channel_ = 32;
+  param->kernel_h_ = 3, param->kernel_w_ = 3;
+  param->stride_h_ = 2, param->stride_w_ = 2;
+  param->pad_u_ = 0, param->pad_d_ = 1, param->pad_l_ = 0, param->pad_r_ = 1;
+
+  auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/");
+  //  std::cout << filenames[0] << std::endl;
+  //  std::cout << filenames[1] << std::endl;
+  //  std::cout << filenames[2] << std::endl;
+  //  std::cout << filenames[3] << std::endl;
+  TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
+  lite::opencl::OpenCLRuntime::DeleteInstance();
+}
+
+TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
+  auto param = new ConvParameter;
+  param->input_batch_ = 1, param->input_h_ = 1, param->input_w_ = 64, param->input_channel_ = 512;
+  param->output_batch_ = 1, param->output_h_ = 1, param->output_w_ = 64, param->output_channel_ = 7358;
+  param->kernel_h_ = 1, param->kernel_w_ = 1;
+  param->stride_h_ = 1, param->stride_w_ = 1;
+  param->pad_u_ = 0, param->pad_d_ = 0, param->pad_l_ = 0, param->pad_r_ = 0;
+
+  auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/02_fp32/");
+  //  std::cout << filenames[0] << std::endl;
+  //  std::cout << filenames[1] << std::endl;
+  //  std::cout << filenames[2] << std::endl;
+  //  std::cout << filenames[3] << std::endl;
+  TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
+  lite::opencl::OpenCLRuntime::DeleteInstance();
+}
+
+}  // namespace mindspore
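A review footnote on the tests: every `new` here (the `ConvParameter`, the four tensors, the kernel, the subgraph) is never freed, `bias_vec` is sized from `weight_tensor->ElementsNum()` rather than the bias shape, and `weight_tensor`/`bias_tensor` point at stack-owned vector storage. That is tolerable in a one-shot UT process, but if the harness grows, a teardown along these lines at the end of `TEST_MAIN` would avoid leak and double-free surprises (a sketch only, assuming the tensors and subgraph do not free these pointers themselves):

```cpp
// Hypothetical teardown for TEST_MAIN; ownership assumptions are unverified.
weight_tensor->SetData(nullptr);  // detach the stack-owned weight_vec storage
bias_tensor->SetData(nullptr);    // detach the stack-owned bias_vec storage
delete sub_graph;                 // add `delete conv_kernel;` if the subgraph doesn't own it
for (auto *t : {input_tensor, weight_tensor, bias_tensor, output_tensor}) {
  delete t;
}
delete param;
```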