Fix OpenCL infer-shape bug

pull/13998/head
wangdongxu 4 years ago
parent 7324ee14c6
commit d4f49b3a10

@ -68,12 +68,14 @@ void *OpenCLAllocator::MinimumFit(MemType mem_type, size_t size, const ImageSize
void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
cl_int ret = CL_SUCCESS;
MS_ASSERT(buffer);
MS_ASSERT(size > 0);
*buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret);
if (*buffer == nullptr) {
MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
return nullptr;
}
void *host_ptr = ocl_runtime_->MapBuffer(**buffer, CL_MAP_READ | CL_MAP_WRITE, size);
MS_ASSERT(host_ptr);
if (host_ptr == nullptr) {
delete *buffer;
MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr;

File diff suppressed because it is too large Load Diff

@ -44,6 +44,19 @@ using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
int ArithmeticOpenCLKernel::CheckSpecs() {
for (auto &tensor : in_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 input";
return RET_ERROR;
}
}
for (auto &tensor : out_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 output";
return RET_ERROR;
}
}
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;

@ -286,6 +286,7 @@ void Conv2DOpenCLKernel::InitFilter() {
}
FreeDequantedWeight();
FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());
}
void Conv2DOpenCLKernel::InitBias() {
@ -322,6 +323,7 @@ void Conv2DOpenCLKernel::InitBias() {
}
}
allocator->UnmapBuffer(packed_bias_);
FreeTmpWeight(in_tensors_.at(kBiasIndex)->data_c());
}
void Conv2DOpenCLKernel::SetConstArgs() {
@ -480,11 +482,9 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
MS_ASSERT(!inputs.empty());
MS_ASSERT(!outputs.empty());
MS_ASSERT(opParameter);
MS_ASSERT(inputs.front()->shape().size() == 4);
MS_ASSERT(outputs.front()->shape().size() == 4);
auto *conv_param = reinterpret_cast<ConvParameter *>(opParameter);
int input_channel = inputs.front()->shape().at(3);
int output_channel = outputs.front()->shape().at(3);
int input_channel = conv_param->input_channel_;
int output_channel = conv_param->output_channel_;
int group = conv_param->group_;
// case 1: depthwise conv2d
@ -529,6 +529,10 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
}
}
if (!infer_shape_done) {
StoreTmpWeight(inputs.at(kWeightIndex));
if (inputs.size() > kBiasIndex) {
StoreTmpWeight(inputs.at(kBiasIndex));
}
MS_LOG(WARNING) << "kernel don't infer shape yet!";
return kernel;
}

@ -97,10 +97,11 @@ int ReshapeOpenCLKernel::Run() {
}
int ReshapeOpenCLKernel::PreProcess() {
if (Type() == PrimitiveType_Reshape && !infer_shape_flag_) {
if (Type() == PrimitiveType_Reshape && !op_parameter_->infer_flag_) {
auto shape_tensor = in_tensors_[1];
if (!shape_tensor->IsConst()) {
ocl_runtime_->SyncCommandQueue();
shape_tensor->MutableData();
}
}
return OpenCLKernel::PreProcess();

@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_Resize;
namespace mindspore::kernel {
int ResizeOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 1 || out_tensors_.size() != 1) {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
@ -119,6 +119,17 @@ int ResizeOpenCLKernel::Run() {
return RET_OK;
}
int ResizeOpenCLKernel::PreProcess() {
  // If shape inference has not run yet for this Resize op, the runtime shape
  // tensor (input #1) may only exist on the device. Flush pending GPU work and
  // touch the tensor's host data first — presumably so the base-class
  // PreProcess can read valid shape values (mirrors ReshapeOpenCLKernel).
  const bool shape_pending = !op_parameter_->infer_flag_;
  if (shape_pending && Type() == PrimitiveType_Resize) {
    auto *dynamic_shape = in_tensors_[1];
    if (!dynamic_shape->IsConst()) {
      ocl_runtime_->SyncCommandQueue();
      dynamic_shape->MutableData();
    }
  }
  return OpenCLKernel::PreProcess();
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
} // namespace mindspore::kernel

@ -34,6 +34,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int PreProcess() override;
private:
float getResizeScaleFactor(int input_size, int output_size);

@ -71,8 +71,20 @@ int StackOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << " only support input size = 2 and output size = 1";
return RET_ERROR;
}
for (auto &tensor : in_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << " only support fp32/fp16 input";
return RET_ERROR;
}
}
for (auto &tensor : out_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << " only support fp32/fp16 output";
return RET_ERROR;
}
}
if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
MS_LOG(ERROR) << " only support dim <= 4 ";
MS_LOG(ERROR) << " only support 0<dim<=4";
return RET_ERROR;
}
axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_;

@ -64,32 +64,22 @@ void ToFormatOpenCLKernel::SetGlobalLocal() {
}
int ToFormatOpenCLKernel::Prepare() {
std::map<TypeId, std::string> dtype_str{
{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt32, "float"}};
std::string kernel_name;
if (out_mem_type_ == MemType::IMG) {
kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()];
} else {
kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()];
}
static std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float32"},
{kNumberTypeFloat16, "float16"},
{kNumberTypeInt32, "int32"},
{kNumberTypeUInt32, "uint32"}};
auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_";
kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
this->set_name(kernel_name);
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::string program_name = "to_format";
std::string source = to_format_source;
ocl_runtime_->LoadSource(program_name, source);
std::vector<std::string> ext_build_opt;
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
} else {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
}
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
#endif
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
auto output = GpuTensorInfo(out_tensors_.front());
auto output = GpuTensorInfo(out_tensor);
N_ = output.N;
H_ = output.H;
W_ = output.W;
@ -112,15 +102,8 @@ int ToFormatOpenCLKernel::Run() {
}
int ToFormatOpenCLKernel::InferShape() {
if (infer_shape_flag_) {
return RET_OK;
}
if (in_tensors_[0]->shape().size() == 0 || in_tensors_[0]->ElementsNum() < 0) {
MS_LOG(ERROR) << "to_format op in tensor shape is 0, infer shape failed!";
return RET_ERROR;
}
out_tensors_[0]->set_shape(in_tensors_[0]->shape());
infer_shape_flag_ = true;
op_parameter_->infer_flag_ = false;
return RET_OK;
}

@ -97,7 +97,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
}
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
printf("%-30s", name().c_str());
printf("%-30s ", name().c_str());
if (out_tensors().empty()) {
return;
}
@ -134,7 +134,9 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
for (int i = 0; i < print_num && i < total_num; ++i) {
if (tensor->data_type() == kNumberTypeFloat16) {
if (tensor->data_type() == kNumberTypeInt32) {
printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
} else if (tensor->data_type() == kNumberTypeFloat16) {
printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
} else {
printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
@ -191,7 +193,7 @@ int OpenCLKernel::PostProcess() {
}
int OpenCLKernel::InferShape() {
if (infer_shape_flag_) {
if (op_parameter_->infer_flag_) {
return RET_OK;
}
op_parameter_->infer_flag_ = true;
@ -202,12 +204,11 @@ int OpenCLKernel::InferShape() {
op_parameter_->infer_flag_ = false;
return ret;
}
infer_shape_flag_ = true;
return RET_OK;
}
int OpenCLKernel::ReSize() {
if (infer_shape_flag_) {
if (op_parameter_->infer_flag_) {
return RET_OK;
}
auto ret = InferShape();

@ -27,6 +27,7 @@
#include "src/runtime/gpu/opencl/opencl_runtime.h"
#include "mindspore/lite/src/dequant.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "nnacl/resize_parameter.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
@ -35,15 +36,15 @@ namespace mindspore::kernel {
// Parameter block for the layout-conversion ("to format") kernels.
// Leads with a plain OpParameter so the struct can travel through the common
// kernel-creation path that expects an OpParameter* head (GenToFormatOp sets
// op_parameter.type_ = PRIM_TO_FORMAT before use).
struct OpenCLToFormatParameter {
  OpParameter op_parameter{};
  schema::Format src_format{schema::Format::Format_NHWC};   // layout before conversion
  schema::Format dst_format{schema::Format::Format_NHWC4};  // layout after conversion
  lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};  // output memory object kind (image vs buffer)
};
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
MS_ASSERT(dst);
MS_ASSERT(src);
if (src == nullptr || src_num <= 0) {
return;
}
auto *N = dst;
auto *H = dst + 1;
auto *W = dst + 2;
@ -70,10 +71,12 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
// Fill dst[0..3] (the 4-element GPU shape, N/H/W/C order per the 3-argument
// overload) with default_value, then overwrite the leading entries from src.
// A null or empty src is a legal input and simply leaves every entry at
// default_value — therefore src must NOT be asserted non-null here; the old
// MS_ASSERT(src) contradicted the explicit null guard below and would abort
// debug builds on a valid call.
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
  MS_ASSERT(dst);
  for (int i = 0; i < 4; ++i) {
    dst[i] = default_value;
  }
  if (src == nullptr || src_num <= 0) {
    return;  // keep the defaults when no source shape is available
  }
  Broadcast2GpuShape(dst, src, src_num);
}
@ -92,6 +95,10 @@ struct GpuTensorInfo {
H = shape.s[1];
W = shape.s[2];
C = shape.s[3];
MS_ASSERT(N > 0);
MS_ASSERT(H > 0);
MS_ASSERT(W > 0);
MS_ASSERT(C > 0);
Slice = UP_DIV(C, C4NUM);
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
@ -167,7 +174,6 @@ class OpenCLKernel : public LiteKernel {
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {
ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
infer_shape_flag_ = parameter->infer_flag_;
}
~OpenCLKernel() override = default;
int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
@ -199,8 +205,6 @@ class OpenCLKernel : public LiteKernel {
int DequantWeight();
void FreeDequantedWeight();
virtual int InferShape();
bool GetInferShapeFlag() { return infer_shape_flag_; }
void SetInferShapeFlag(bool flag) { infer_shape_flag_ = flag; }
protected:
static std::set<size_t> GenerateLocalByGlobal(size_t global_i);
@ -225,7 +229,6 @@ class OpenCLKernel : public LiteKernel {
cl::Event event_;
void *restore_quant_data_{nullptr};
bool dequant_flag_{false};
bool infer_shape_flag_{false};
private:
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
@ -241,7 +244,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
free(opParameter);
return nullptr;
}
if (!reinterpret_cast<kernel::OpenCLKernel *>(kernel)->GetInferShapeFlag()) {
if (!opParameter->infer_flag_) {
MS_LOG(WARNING) << "kernel don't infer shape yet!";
return kernel;
}

@ -121,8 +121,6 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
for (size_t i = 0; i < in_tensors.size(); ++i) {
auto *in_tensor = in_tensors.at(i);
auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
auto *new_tensor = new (std::nothrow)
lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
MS_ASSERT(new_tensor);
@ -130,20 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
return RET_ERROR;
}
if (mem_type == MemType::IMG) {
new_tensor->set_format(dst_format);
in_tensor->set_format(src_format);
} else {
new_tensor->set_format(src_format);
in_tensor->set_format(dst_format);
}
out_tensors->emplace_back(new_tensor);
KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) {
desc.data_type = kNumberTypeFloat16;
new_tensor->set_data_type(kNumberTypeFloat16);
}
auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
MS_ASSERT(parameter);
if (parameter == nullptr) {
@ -153,16 +140,7 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
return RET_ERROR;
}
parameter->op_parameter.type_ = PRIM_TO_FORMAT;
bool output_shape_setted = true;
for (auto output : *out_tensors) {
if (output->shape().empty() || output->ElementsNum() < 0) {
output_shape_setted = false;
break;
}
}
parameter->op_parameter.infer_flag_ = output_shape_setted;
parameter->src_format = src_format;
parameter->dst_format = dst_format;
parameter->op_parameter.infer_flag_ = false;
parameter->out_mem_type = mem_type;
out_parameters->emplace_back(parameter);
LiteKernel *in_convert_op = nullptr;
@ -255,8 +233,7 @@ int OpenCLSubGraph::Init() {
int OpenCLSubGraph::UpdateTensorDataTypePass() {
bool is_fp16 = ocl_runtime_->GetFp16Enable();
MS_ASSERT(in_tensors_[0]);
if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
if (is_fp16) {
std::set<lite::Tensor *> out_set;
out_set.insert(in_tensors_.begin(), in_tensors_.end());
out_set.insert(out_tensors_.begin(), out_tensors_.end());
@ -330,16 +307,6 @@ void OpenCLSubGraph::GetInOutNodes() {
}
}
bool OpenCLSubGraph::IsSubGraphInferShapeDone() {
for (auto node : this->nodes_) {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
if (!opencl_kernel->GetInferShapeFlag()) {
return false;
}
}
return true;
}
int OpenCLSubGraph::Prepare() {
for (const auto tensor : in_tensors_) {
MS_ASSERT(tensor);
@ -354,7 +321,6 @@ int OpenCLSubGraph::Prepare() {
MS_LOG(ERROR) << "Create OpenCLExecutor fail";
return RET_ERROR;
}
auto ret = RET_OK;
for (auto node : this->nodes_) {
if (node == nullptr) {
MS_LOG(ERROR) << "node in Subgraph is nullptr";
@ -363,26 +329,28 @@ int OpenCLSubGraph::Prepare() {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMul, schema::PrimitiveType_BiasAdd};
if (pre_init_weight_list.find(opencl_kernel->Type()) != pre_init_weight_list.end()) {
ret = opencl_kernel->InitWeights();
auto ret = opencl_kernel->InitWeights();
if (ret != RET_OK) {
MS_LOG(ERROR) << "init weights " << node->name() << " failed";
return ret;
}
}
if (opencl_kernel->GetInferShapeFlag()) {
ret = node->Prepare();
if (opencl_kernel->op_parameter()->infer_flag_) {
auto ret = node->Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
return ret;
}
}
}
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
// If tuning_mode is DEFAULT, just malloc memory for reuse.
ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
if (all_kernels_infer_done_) {
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
// If tuning_mode is DEFAULT, just malloc memory for reuse.
auto ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
}
}
return RET_OK;
}
@ -423,7 +391,7 @@ int OpenCLSubGraph::ReSize(bool interrupt) {
for (auto &output : outputs) {
output->FreeData();
}
opencl_kernel->SetInferShapeFlag(false);
opencl_kernel->op_parameter()->infer_flag_ = false;
}
for (auto kernel : nodes_) {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);

@ -36,6 +36,9 @@ class OpenCLSubGraph : public SubGraphKernel {
subgraph_type_ = kGpuSubGraph;
this->name_ = "GpuSubGraph";
nodes_set_.insert(nodes.begin(), nodes.end());
all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) {
return kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_;
});
}
~OpenCLSubGraph() override;
@ -48,7 +51,6 @@ class OpenCLSubGraph : public SubGraphKernel {
int Run() override;
int Run(const KernelCallBack &before, const KernelCallBack &after) override;
int InsertOpsPass();
bool IsSubGraphInferShapeDone();
private:
void UnInit();
@ -83,6 +85,7 @@ class OpenCLSubGraph : public SubGraphKernel {
std::set<LiteKernel *> nodes_set_;
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
bool all_kernels_infer_done_ = false;
};
} // namespace mindspore::kernel

@ -296,4 +296,27 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
return RET_OK;
}
static std::set<void *> tmp_weights;
// Take a private heap copy of a weight tensor's data so it remains valid until
// the kernel can actually run shape inference (the original buffer may be
// freed or repacked before then). The copy is recorded in tmp_weights and must
// later be released through FreeTmpWeight().
void StoreTmpWeight(lite::Tensor *tensor) {
  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
  // Nothing to copy for a null tensor or one without sized host data.
  if (tensor == nullptr || tensor->data_c() == nullptr || tensor->Size() == 0) {
    return;
  }
  void *new_data = malloc(tensor->Size());
  MS_ASSERT(new_data);
  if (new_data == nullptr) {
    // Was a silent return: log the allocation failure so it is diagnosable,
    // consistent with the error logging used elsewhere in this file.
    MS_LOG(ERROR) << "malloc tmp weight data failed, size=" << tensor->Size();
    return;
  }
  memcpy(new_data, tensor->data_c(), tensor->Size());
  // Repoint the tensor at our copy; ownership of new_data is tracked in
  // tmp_weights so FreeTmpWeight() can release it later.
  tensor->set_data(new_data);
  tmp_weights.insert(new_data);
}
// Release a pointer previously registered by StoreTmpWeight(). Pointers that
// are not tracked in tmp_weights are ignored, so calling this on ordinary
// (non-copied) weight data is harmless.
void FreeTmpWeight(void *data) {
  // Single lookup: find + erase-by-iterator instead of count() followed by
  // erase(key), which searched the set twice for the same element.
  auto it = tmp_weights.find(data);
  if (it != tmp_weights.end()) {
    free(data);
    tmp_weights.erase(it);
  }
}
} // namespace mindspore::kernel

@ -63,6 +63,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
TypeId expect_data_type, const std::vector<int> &expect_shape);
void StoreTmpWeight(lite::Tensor *tensor);
void FreeTmpWeight(void *tensor);
template <class T1, class T2>
void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane_in, int plane_out, int channel,
const std::function<T2(T1)> &to_dtype) {

@ -224,7 +224,7 @@ int32_t Tensor::ElementsC4Num() const {
if (this->category_ == CONST_SCALAR) {
return 1;
}
int32_t result = 0;
int32_t result = 1;
if (this->shape_.size() == 4) {
result = Batch() * Height() * Width() * ((Channel() + 3) / 4 * 4);
} else if (this->shape_.size() == 2) {

Loading…
Cancel
Save