From 8904f0cf42f98aaf4317924bbc00849b8d236df1 Mon Sep 17 00:00:00 2001
From: wandongdong <wandongdong1@huawei.com>
Date: Sat, 8 Aug 2020 15:23:17 +0800
Subject: [PATCH] fix pool kernel name

---
 .../kernel/opencl/cl/fp32/avg_pool2d.cl       |  4 ++--
 .../kernel/opencl/kernel/arithmetic.cc        |  2 --
 .../runtime/kernel/opencl/kernel/concat.cc    | 10 ++++----
 .../kernel/opencl/kernel/convolution.cc       |  5 +++-
 .../kernel/opencl/subgraph_opencl_kernel.cc   |  4 ++--
 .../src/runtime/opencl/opencl_allocator.cc    |  4 ++--
 .../src/runtime/opencl/opencl_executor.cc     |  2 +-
 mindspore/lite/src/scheduler.cc               | 24 +++++++++++++++----
 8 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
index 0e60a4ca1e..891b2fcf51 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
@@ -1,4 +1,4 @@
-__kernel void AvgPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape,
+__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
                            const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
@@ -31,7 +31,7 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons
 
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-__kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                                   const int4 output_shape, const int2 stride, const int2 kernel_size,
                                   const int2 padding) {
   // axis to dst tensor coordinate
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index c43c410eb0..96ee4b37be 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -148,8 +148,6 @@ int ArithmeticOpenCLKernel::Run() {
     }
     runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
     runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
-    MS_LOG(DEBUG) << arg_idx-2 << " " << weight_;
-    MS_LOG(DEBUG) << arg_idx-1 << " " << bias_;
   }
   runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
   int H = outputs_[0]->Batch() * outputs_[0]->Height();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index 3226d1bc44..536bd84b0d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -147,21 +147,18 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
   local->push_back(z);
 }
 int ConcatOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->Name() << " Running!";
   auto param = reinterpret_cast<ConcatParameter *>(this->opParameter);
   if (param->axis_ == 0) {
     return Run_axis0();
   }
 
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  MS_LOG(INFO) << " judge the numbers of input vector";
   auto input0_shape = inputs_[0]->shape();
   auto input1_shape = inputs_[1]->shape();
-  auto input2_shape = inputs_[2]->shape();
   auto output_shape = outputs_[0]->shape();
 
   cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)};  // change
-  cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4),
-                            DivideRoundUp(input2_shape[3], 4)};
   cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)};
 
   uint32_t OH = output_shape[0] * output_shape[1];  // N*H
@@ -173,14 +170,15 @@ int ConcatOpenCLKernel::Run() {
 
   int arg_cn = 0;
   if (inputs_.size() == 2) {
-    MS_LOG(INFO) << " SetKernelArg";
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_);
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
   } else if (inputs_.size() == 3) {
-    MS_LOG(INFO) << " SetKernelArg";
+    auto input2_shape = inputs_[2]->shape();
+    cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4),
+                              DivideRoundUp(input2_shape[3], 4)};
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
     ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index ec9715f1ff..9fc71ba389 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -228,6 +228,9 @@ static int GetBiggestDivider(int x, int y) {
 int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto param = reinterpret_cast<ConvParameter *>(opParameter);
+  param->output_h_ = outputs_[0]->Height();
+  param->output_w_ = outputs_[0]->Width();
+  param->output_channel_ = outputs_[0]->Channel();
 
   constexpr size_t work_group_size[] = {4, 4, 1};
   auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
@@ -287,7 +290,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s
 }
 
 int ConvolutionOpenCLKernel::Run() {
-  std::cout << "ConvolutionOpenCLKernel::Run()\n";
+  MS_LOG(DEBUG) << this->Name() << " Running!";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
 
   int arg_cn = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
index dcd2fe8943..7c495cf68f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
@@ -24,6 +24,7 @@ SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }
 
 int SubGraphOpenCLKernel::Init() {
   allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+  MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size();
   for (const auto tensor : inputs_) {
     tensor->set_allocator(allocator_);
   }
@@ -38,8 +39,7 @@ int SubGraphOpenCLKernel::Init() {
       data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true);
       tensor->SetData(data);
     } else {
-      MS_LOG(ERROR) << "OpenCL kernel must use GPU buffer pointer, "
-                    << "please make sure that this buffer allocate by OpenCLAllocator!";
+      MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!";
     }
   }
   return 0;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
index 588a347464..649446cbf4 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@@ -109,7 +109,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size)
     auto mem_buf = iter->second;
     bool is_match{mem_buf->img_size.size() == img_size.size()};
     for (int i = 0; i < img_size.size() && is_match; ++i) {
-      is_match = img_size[i] == mem_buf->img_size[i];
+      is_match &= img_size[i] == mem_buf->img_size[i];
     }
     if (is_match) {
       free_list_.erase(iter);
@@ -166,7 +166,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
     auto mem_buf = iter->second;
     bool is_match{mem_buf->img_size.size() == img_size.size()};
     for (int i = 0; i < img_size.size() && is_match; ++i) {
-      is_match = img_size[i] == mem_buf->img_size[i];
+      is_match &= img_size[i] == mem_buf->img_size[i];
     }
     if (is_match) {
       free_list_.erase(iter);
diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
index be273ea523..a63d77e1da 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
@@ -168,6 +168,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema
 int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format,
     schema::Format dst_format) {
   if (dst_format == schema::Format_NHWC4) {
+    tensor->SetFormat(schema::Format_NHWC4);
     // convert to nhwc4
     auto *src_data = tensor->Data();
     auto *dst_data{src_data};
@@ -190,7 +191,6 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema:
     dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size);
     tensor->SetData(dst_data);
     allocator_->Free(src_data);
-    tensor->SetFormat(schema::Format_NHWC4);
     return RET_OK;
   } else {
     MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index 80c05c2720..f4b2162352 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -142,10 +142,26 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKer
   kernel::LiteKernel *sub_kernel = nullptr;
 #if SUPPORT_GPU
   if (arch == kernel::KERNEL_ARCH::kGPU) {
-    std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels);
-    std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels);
-    std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels);
-    std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels);
+    auto head_kernel = kernels.front();
+    auto tail_kernel = kernels.back();
+    std::vector<kernel::LiteKernel *> input_kernels{head_kernel};
+    std::vector<kernel::LiteKernel *> output_kernels{tail_kernel};
+    std::vector<tensor::Tensor *> input_tensors;
+    std::vector<tensor::Tensor *> output_tensors;
+    for (auto tensor : head_kernel->GetInputs()) {
+      if (tensor->Data() == nullptr) {
+        input_tensors.emplace_back(tensor);
+      }
+    }
+    for (auto tensor : tail_kernel->GetInputs()) {
+      if (tensor->Data() == nullptr) {
+        output_tensors.emplace_back(tensor);
+      }
+    }
+//    std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels);
+//    std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels);
+//    std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels);
+//    std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels);
     sub_kernel =
       new kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels);
     sub_kernel->Init();