From 8904f0cf42f98aaf4317924bbc00849b8d236df1 Mon Sep 17 00:00:00 2001 From: wandongdong Date: Sat, 8 Aug 2020 15:23:17 +0800 Subject: [PATCH] fix pool kernel name --- .../kernel/opencl/cl/fp32/avg_pool2d.cl | 4 ++-- .../kernel/opencl/kernel/arithmetic.cc | 2 -- .../runtime/kernel/opencl/kernel/concat.cc | 10 ++++---- .../kernel/opencl/kernel/convolution.cc | 5 +++- .../kernel/opencl/subgraph_opencl_kernel.cc | 4 ++-- .../src/runtime/opencl/opencl_allocator.cc | 4 ++-- .../src/runtime/opencl/opencl_executor.cc | 2 +- mindspore/lite/src/scheduler.cc | 24 +++++++++++++++---- 8 files changed, 35 insertions(+), 20 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl index 0e60a4ca1e..891b2fcf51 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl @@ -1,4 +1,4 @@ -__kernel void AvgPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape, +__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); @@ -31,7 +31,7 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, +__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index c43c410eb0..96ee4b37be 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -148,8 +148,6 @@ int ArithmeticOpenCLKernel::Run() { } runtime_->SetKernelArg(kernel_, arg_idx++, weight_); runtime_->SetKernelArg(kernel_, arg_idx++, bias_); - MS_LOG(DEBUG) << arg_idx-2 << " " << weight_; - MS_LOG(DEBUG) << arg_idx-1 << " " << bias_; } runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); int H = outputs_[0]->Batch() * outputs_[0]->Height(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 3226d1bc44..536bd84b0d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -147,21 +147,18 @@ void ConcatGetWorkGroup(const std::vector &global, std::vector * local->push_back(z); } int ConcatOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->Name() << " Running!"; auto param = reinterpret_cast(this->opParameter); if (param->axis_ == 0) { return Run_axis0(); } auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - MS_LOG(INFO) << " judge the numbers of input vector"; auto input0_shape = inputs_[0]->shape(); auto input1_shape = inputs_[1]->shape(); - auto input2_shape = inputs_[2]->shape(); auto output_shape = outputs_[0]->shape(); cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)}; // change - cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4), - DivideRoundUp(input2_shape[3], 4)}; cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)}; uint32_t OH = output_shape[0] * output_shape[1]; // N*H @@ -173,14 +170,15 @@ int ConcatOpenCLKernel::Run() { int arg_cn = 0; if (inputs_.size() == 2) { - MS_LOG(INFO) << " SetKernelArg"; ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); } else if (inputs_.size() == 3) { - MS_LOG(INFO) << " SetKernelArg"; + auto input2_shape = inputs_[2]->shape(); + cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4), + DivideRoundUp(input2_shape[3], 4)}; ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index ec9715f1ff..9fc71ba389 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -228,6 +228,9 @@ static int GetBiggestDivider(int x, int y) { int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::vector *local) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto param = reinterpret_cast(opParameter); + param->output_h_ = outputs_[0]->Height(); + param->output_w_ = outputs_[0]->Width(); + param->output_channel_ = outputs_[0]->Channel(); constexpr size_t work_group_size[] = {4, 4, 1}; auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); @@ -287,7 +290,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_s } int ConvolutionOpenCLKernel::Run() { - std::cout << "ConvolutionOpenCLKernel::Run()\n"; + MS_LOG(DEBUG) << this->Name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); int arg_cn = 0; diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index dcd2fe8943..7c495cf68f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -24,6 +24,7 @@ SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } int SubGraphOpenCLKernel::Init() { allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); + MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); for (const auto tensor : inputs_) { tensor->set_allocator(allocator_); } @@ -38,8 +39,7 @@ int SubGraphOpenCLKernel::Init() { data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true); tensor->SetData(data); } else { - MS_LOG(ERROR) << "OpenCL kernel must use GPU buffer pointer, " - << "please make sure that this buffer allocate by OpenCLAllocator!"; + MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; } } return 0; diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index 588a347464..649446cbf4 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -109,7 +109,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector& img_size) auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { - is_match = img_size[i] == mem_buf->img_size[i]; + is_match &= img_size[i] == mem_buf->img_size[i]; } if (is_match) { free_list_.erase(iter); @@ -166,7 +166,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { - is_match = img_size[i] == mem_buf->img_size[i]; + is_match &= img_size[i] == mem_buf->img_size[i]; } if (is_match) { free_list_.erase(iter); diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index be273ea523..a63d77e1da 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -168,6 +168,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { + tensor->SetFormat(schema::Format_NHWC4); // convert to nhwc4 auto *src_data = tensor->Data(); auto *dst_data{src_data}; @@ -190,7 +191,6 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema: dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size); tensor->SetData(dst_data); allocator_->Free(src_data); - tensor->SetFormat(schema::Format_NHWC4); return RET_OK; } else { MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 80c05c2720..f4b2162352 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -142,10 +142,26 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels); - std::vector output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels); - std::vector input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels); - std::vector output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels); + auto head_kernel = kernels.front(); + auto tail_kernel = kernels.back(); + std::vector input_kernels{head_kernel}; + std::vector output_kernels{tail_kernel}; + std::vector input_tensors; + std::vector output_tensors; + for (auto tensor : head_kernel->GetInputs()) { + if (tensor->Data() == nullptr) { + input_tensors.emplace_back(tensor); + } + } + for (auto tensor : tail_kernel->GetInputs()) { + if (tensor->Data() == nullptr) { + output_tensors.emplace_back(tensor); + } + } +// std::vector input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels); +// std::vector output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels); +// std::vector input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels); +// std::vector output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels); sub_kernel = new kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels); sub_kernel->Init();