!9154 [MS][LITE][GPU] fix bug: matmul run failed when filter is not constant

From: @chenzupeng
Reviewed-by: @ddwsky, @zhanghaibo5
Signed-off-by: @ddwsky
pull/9154/MERGE
Committed by mindspore-ci-bot via Gitee
commit 039ddc072f
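The change applies the same two-part pattern to every affected OpenCL kernel: InitWeights() now rejects filter and bias tensors that are not constant (their data only exists at run time, so it cannot be packed during Prepare()), and Prepare() now checks the status returned by InitWeights() instead of discarding it, so the failure surfaces as an error code rather than a failure later at run time. A minimal standalone sketch of that control flow follows; the Tensor and DemoKernel types and the RET_* constants are stand-ins for illustration, not the real MindSpore Lite declarations.

#include <iostream>
#include <vector>

// Stand-in status codes and tensor type; the real ones live in the MindSpore Lite headers.
constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;

struct Tensor {
  bool is_const;
  bool IsConst() const { return is_const; }
};

struct DemoKernel {
  std::vector<Tensor *> in_tensors_;

  int InitWeights() {
    // The guard added by this PR: refuse to pack a filter that is not constant.
    if (!in_tensors_.at(1)->IsConst()) {
      std::cerr << "kernel doesn't support non-constant filter yet.\n";
      return RET_ERROR;
    }
    // ... pack the constant filter into a GPU-friendly layout here ...
    return RET_OK;
  }

  int Prepare() {
    // The second half of the fix: propagate the status instead of discarding it.
    auto ret = InitWeights();
    if (ret != RET_OK) {
      return ret;
    }
    // ... SetGlobalLocal(), SetConstArgs(), etc. ...
    return RET_OK;
  }
};

int main() {
  Tensor input{false};
  Tensor runtime_filter{false};  // filter produced at run time, i.e. not constant
  DemoKernel kernel{{&input, &runtime_filter}};
  // Prepare() now fails cleanly with RET_ERROR instead of continuing with an unpacked filter.
  return kernel.Prepare() == RET_OK ? 0 : 1;
}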

@@ -120,7 +120,10 @@ int Conv2DOpenCLKernel::Prepare() {
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
   }
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   return RET_OK;
@@ -256,8 +259,16 @@ int Conv2DOpenCLKernel::InitBias() {
 }
 int Conv2DOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   InitFilter();
   if (has_bias_) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "Conv2D doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     InitBias();
   }
   return RET_OK;

@@ -52,7 +52,10 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -102,6 +105,10 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
 }
 int Conv2dTransposeOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   int ci = in_tensors_[0]->shape()[3];
   int co = out_tensors_[0]->shape()[3];
@@ -171,6 +178,10 @@ int Conv2dTransposeOpenCLKernel::InitWeights() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() >= 3) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "Conv2dTranspose doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {

@@ -73,7 +73,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
@@ -81,6 +84,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
 }
 int DepthwiseConv2dOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "DepthwiseConv2d doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
   auto allocator = ocl_runtime_->GetAllocator();
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
@@ -122,6 +129,10 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   allocator->UnmapBuffer(packed_weight_);
   if (in_tensors_.size() == kInputSize2) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "DepthwiseConv2d doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     size_t dtype_size = sizeof(float);
     if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
       dtype_size = sizeof(int16_t);

@@ -81,7 +81,10 @@ int FullConnectionOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetConstArgs();
   SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -89,6 +92,10 @@ int FullConnectionOpenCLKernel::Prepare() {
 }
 int FullConnectionOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "FullConnection doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape.C;
   int ci4 = UP_DIV(ci, C4NUM);
@@ -96,7 +103,6 @@ int FullConnectionOpenCLKernel::InitWeights() {
   int co4 = UP_DIV(co, C4NUM);
   int h = inShape.H;
   int w = inShape.W;
   size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
   padWeight_ = allocator->Malloc(h * w * ci4 * co4 * C4NUM * C4NUM * dtype_size);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
@@ -162,6 +168,10 @@ int FullConnectionOpenCLKernel::InitWeights() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() >= 3) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "FullConnection doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
         reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];

@@ -64,7 +64,10 @@ int MatMulOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetConstArgs();
   SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -73,6 +76,10 @@ int MatMulOpenCLKernel::Prepare() {
 int MatMulOpenCLKernel::InitWeights() {
   // ABMCI @ ABCICO = ABMCO
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "MatMul doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape[3];
   int ci4 = UP_DIV(ci, C4NUM);

@@ -39,8 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel {
  private:
  PoolingParameter *parameter_;
-  std::vector<size_t> local_size_;
-  std::vector<size_t> global_size_;
 };
 } // namespace mindspore::kernel
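The removed local_size_ / global_size_ members appear to duplicate storage the OpenCLKernel base class already uses (the next hunk indexes global_size_[2] directly inside the base class), so keeping copies in PoolingOpenCLKernel only risks shadowing the base-class fields. A tiny sketch of that hazard, using made-up class names rather than the real kernel hierarchy:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical base class, standing in for OpenCLKernel.
struct BaseKernel {
  std::vector<size_t> global_size_;
  void Tune() { std::cout << "base sees " << global_size_.size() << " dims\n"; }
};

// If the derived class re-declares the member, derived-class code fills the shadow
// copy while base-class tuning logic still reads the (empty) base member.
struct ShadowingPooling : BaseKernel {
  std::vector<size_t> global_size_;  // shadows BaseKernel::global_size_
  void SetGlobal() { global_size_ = {4, 4, 8}; }
};

int main() {
  ShadowingPooling k;
  k.SetGlobal();
  k.Tune();  // prints "base sees 0 dims" because the base member was never set
  return 0;
}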

@@ -245,12 +245,12 @@ class OpenCLKernel : public LiteKernel {
       candidate_z = GenerateLocalByGlobal(global_size_[2]);
     }
     for (auto x : candidate_x) {
-      if (x < max_work_items[0]) {
+      if (x <= max_work_items[0]) {
         for (auto y : candidate_y) {
-          if (y < max_work_items[1]) {
+          if (y <= max_work_items[1]) {
             for (auto z : candidate_z) {
               auto group_size = x * y * z;
-              if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) {
+              if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
                 BaseTuningParameter tuning_param = BaseTuningParameter();
                 tuning_param.local_size = {x, y, z};
                 tuning_params.push_back(tuning_param);
@@ -341,11 +341,11 @@ class OpenCLKernel : public LiteKernel {
   static std::set<size_t> GenerateLocalByGlobal(size_t global_i) {
     std::set<size_t> local_ = {};
     int index = 1;
-    while (index < global_i) {
+    while (index <= global_i) {
       local_.insert(index);
       index *= 2;
     }
-    for (size_t i = 1; i < 16; i++) {
+    for (size_t i = 1; i <= 16; i++) {
       if (global_i % i == 0) {
         local_.insert(i);
       }
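The comparison fixes above are boundary conditions: a local size exactly equal to the device's max_work_items limit or a work-group exactly at max_workgroup_size (or MIN_WORKGROUP_SIZE) is legal in OpenCL and should not be filtered out, and GenerateLocalByGlobal should include the global size itself (and 16) among the candidates. A standalone approximation of the candidate generator illustrates what the '<' vs '<=' change adds; it mirrors the loop structure above but is not the exact MindSpore code:

#include <cstddef>
#include <iostream>
#include <set>

// Approximation of GenerateLocalByGlobal: powers of two up to the global size,
// plus every divisor of the global size up to 16.
std::set<size_t> GenerateLocalByGlobal(size_t global_i, bool inclusive) {
  std::set<size_t> local;
  size_t index = 1;
  while (inclusive ? index <= global_i : index < global_i) {
    local.insert(index);
    index *= 2;
  }
  for (size_t i = 1; (inclusive ? i <= 16 : i < 16); i++) {
    if (global_i % i == 0) {
      local.insert(i);
    }
  }
  return local;
}

int main() {
  // With the old '<' bounds, a global size of 16 never yields 16 itself as a
  // candidate local size, so a work-group that exactly fits the limit is never tried.
  for (bool inclusive : {false, true}) {
    auto candidates = GenerateLocalByGlobal(16, inclusive);
    std::cout << (inclusive ? "<= : " : "<  : ");
    for (auto v : candidates) std::cout << v << ' ';
    std::cout << '\n';
  }
  return 0;
}

The same reasoning applies to the max_work_items / max_workgroup_size checks: the old strict inequalities silently discarded every candidate that sat exactly on a device limit.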
