From cf2b868892bc7d638d8db35497d1462b7cd52fe1 Mon Sep 17 00:00:00 2001
From: Pengyongrong
Date: Tue, 18 Aug 2020 20:48:41 -0700
Subject: [PATCH] solve some problems for power ops

---
 .../src/runtime/kernel/opencl/cl/power.cl     | 18 +++------
 .../runtime/kernel/opencl/kernel/conv2d.cc    | 16 ++++----
 .../kernel/opencl/kernel/conv2d_transpose.cc  | 18 ++++-----
 .../kernel/opencl/kernel/fullconnection.cc    | 10 ++---
 .../kernel/opencl/kernel/layer_norm.cc        | 38 +++++++++---------
 .../runtime/kernel/opencl/kernel/matmul.cc    |  8 ++--
 .../src/runtime/kernel/opencl/kernel/power.cc | 40 +------------------
 .../src/runtime/kernel/opencl/kernel/power.h  |  4 --
 .../src/runtime/kernel/opencl/power_tests.cc  | 16 ++++----
 9 files changed, 60 insertions(+), 108 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
index 6d1e32566e..44691bcfd4 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
@@ -24,27 +24,19 @@ FLT OptimizedPowerImpl(FLT x, int exponent) {
   return exponent >= 0 ? result : 1 / result;
 }
 
-__kernel void power(__read_only image2d_t input0, __global FLT *input1, __write_only image2d_t output,
+__kernel void power(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
                     int4 output_shape, FLT4 parameter) {
   CHECK_IDX;
   int n = X / output_shape.y;
   int h = X % output_shape.y;
-  int unalign_w = (int)parameter.w;
   FLT4 result;
   FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (n * output_shape.y + h)));
-  int index_weight = (n * output_shape.y + h) * output_shape.z * unalign_w + Y * unalign_w + Z * C4NUM;
+  FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*output_shape.w + Z, (n * output_shape.y + h)));
+  FLT tmp_result[4];
   FLT tmp_result0[4] = {result0.x, result0.y, result0.z, result0.w};
-  FLT tmp_result1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-  if ((Z + 1) * C4NUM <= unalign_w) {
-    for (int i = 0; i < C4NUM; ++i) {
-      tmp_result1[i] = input1[index_weight + i];
-    }
-  } else {
-    for (int i = 0; i < unalign_w % C4NUM; ++i) {
-      tmp_result1[i] = input1[index_weight + i];
-    }
-  }
+  FLT tmp_result1[4] = {result1.x, result1.y, result1.z, result1.w};
+
   for (int i = 0; i < 4; ++i) {
     tmp_result0[i] = tmp_result0[i] * parameter.z + parameter.y;
     if (floor(tmp_result1[i]) == tmp_result1[i]) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 3cf836ad2a..ca474e73e9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -65,6 +65,14 @@ int Conv2DOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D.";
     return RET_ERROR;
   }
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   // for fusion: ActivationType_LEAKY_RELU ActivationType_TANH
   switch (static_cast<ActType>(param_->act_type_)) {
     case ActType_No:
@@ -302,16 +310,8 @@ int Conv2DOpenCLKernel::InitBias() {
 }
 
 int Conv2DOpenCLKernel::InitWeights() {
-  if (!in_tensors_.at(1)->IsConst()) {
-    MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   InitFilter();
   if (has_bias_) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
     InitBias();
   }
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 6cd6af9fda..a3a0ee01bf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -49,6 +49,14 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
     return RET_ERROR;
   }
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -117,10 +125,6 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
 }
 
 int Conv2dTransposeOpenCLKernel::InitWeights() {
-  if (!in_tensors_.at(1)->IsConst()) {
-    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   int ci = in_tensors_[0]->shape()[3];
   int co = out_tensors_[0]->shape()[3];
@@ -189,11 +193,7 @@ int Conv2dTransposeOpenCLKernel::InitWeights() {
   bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * data_size, img_size);
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
-  if (in_tensors_.size() >= 3) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
+  if (in_tensors_.size() == 3) {
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index 54e3ffa6b8..10662dce8c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -92,6 +92,10 @@ int FullConnectionOpenCLKernel::CheckSpecs() {
       return RET_ERROR;
     }
   }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "FullConnection don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   CI_remainder_ = input_nhw / N_;
   return RET_OK;
 }
@@ -211,11 +215,7 @@ int FullConnectionOpenCLKernel::InitBias() {
   bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
-  if (in_tensors_.size() >= 3) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "FullConnection don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
+  if (in_tensors_.size() == 3) {
     if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
         reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
index b98e24ff77..e6fc9e9b5a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
@@ -33,24 +33,24 @@ namespace mindspore::kernel {
 
 int LayerNormOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_);
-  if (param->elementwise_mode_ == ELEMENTWISE_PER_CHANNEL) {
-    if (in_tensors_.size() != 3) {
-      MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
-      return RET_ERROR;
-    }
-    if (param->normalized_dims_ > in_tensors_.at(0)->shape().size()) {
-      MS_LOG(ERROR) << " invalid normalized_shape_ size" << param->normalized_dims_ << std::endl;
-      return RET_ERROR;
-    }
-  } else if (param->elementwise_mode_ == ELEMENTWISE_NOT) {
-    if (in_tensors_.size() != 1) {
-      MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
-      return RET_ERROR;
-    }
-  } else {
-    MS_LOG(ERROR) << "Unsupported elementwise_mode_" << param->elementwise_mode_;
-    return RET_ERROR;
-  }
+  // if (param->elementwise_mode_ == ELEMENTWISE_PER_CHANNEL) {
+  //   if (in_tensors_.size() != 3) {
+  //     MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
+  //     return RET_ERROR;
+  //   }
+  //   if (param->normalized_dims_ > in_tensors_.at(0)->shape().size()) {
+  //     MS_LOG(ERROR) << " invalid normalized_shape_ size" << param->normalized_dims_ << std::endl;
+  //     return RET_ERROR;
+  //   }
+  // } else if (param->elementwise_mode_ == ELEMENTWISE_NOT) {
+  //   if (in_tensors_.size() != 1) {
+  //     MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
+  //     return RET_ERROR;
+  //   }
+  // } else {
+  //   MS_LOG(ERROR) << "Unsupported elementwise_mode_" << param->elementwise_mode_;
+  //   return RET_ERROR;
+  // }
   if (in_tensors_.at(0)->shape().size() != 4 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size()
                   << " out_tensors_.size(): " << out_tensors_.size();
@@ -184,7 +184,7 @@ int LayerNormOpenCLKernel::Initweight() {
 int LayerNormOpenCLKernel::Prepare() {
   use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
   auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_);
-  elementwise_affine_ = param->elementwise_mode_;
+  elementwise_affine_ = true;  // param->elementwise_mode_;
   normalized_dims_ = param->normalized_dims_;
   epsilon_ = param->epsilon_;
   if (elementwise_affine_) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 9f10aabb7a..65623e513a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -48,6 +48,10 @@ int MatMulOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "matmul only support input shape size= 2, 3 or 4.";
     return mindspore::lite::RET_ERROR;
   }
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "Matmul don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -80,10 +84,6 @@ int MatMulOpenCLKernel::Prepare() {
 
 int MatMulOpenCLKernel::InitWeights() {
   // ABMCI @ ABCICO = ABMCO
-  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
-    MS_LOG(ERROR) << "Matmul don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   auto ret = DequantWeight();
   if (ret != RET_OK) {
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
index 2bc1c9afc7..a8e800682b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
@@ -48,40 +48,6 @@ int PowerOpenCLKernel::CheckSpecs() {
   return RET_OK;
 }
 
-int PowerOpenCLKernel::Initweight() {
-  auto allocator = ocl_runtime_->GetAllocator();
-  GpuTensorInfo img_info(in_tensors_.at(1));
-  auto weight_tensor = in_tensors_.at(1);
-  size_t weight_size = img_info.OriginSize;
-  weight_ = allocator->Malloc(weight_size);
-  allocator->MapBuffer(weight_, CL_MAP_WRITE, nullptr, true);
-  memset(weight_, 0x00, weight_size);
-
-  if (weight_tensor->data_type() == kNumberTypeFloat16) {
-    if (use_fp16_enable_) {
-      memcpy(weight_, weight_tensor->data_c(), weight_size);
-    } else {
-      auto weight_fp32 = reinterpret_cast<float *>(weight_);
-      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
-      for (int i = 0; i < img_info.ElementsNum; ++i) {
-        weight_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
-      }
-    }
-  } else {
-    if (use_fp16_enable_) {
-      auto weight_fp16 = reinterpret_cast<float16_t *>(weight_);
-      auto origin_bias_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
-      for (int i = 0; i < img_info.ElementsNum; ++i) {
-        weight_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
-      }
-    } else {
-      memcpy(weight_, weight_tensor->data_c(), weight_size);
-    }
-  }
-  allocator->UnmapBuffer(weight_);
-  return RET_OK;
-}
-
 void PowerGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 2, max_y = 8;
@@ -145,11 +111,9 @@ int PowerOpenCLKernel::Prepare() {
   std::string kernel_name = "power";
   std::string source = power_source;
   std::string program_name = "power";
-  if (broadcast_ && in_tensors_.size() == 1) {
+  if (broadcast_) {
     power_ = param->power_;
     kernel_name += "_broadcast";
-  } else {
-    Initweight();
   }
   scale_ = param->scale_;
   shift_ = param->shift_;
@@ -168,7 +132,7 @@ int PowerOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
   } else {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
-    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_, lite::opencl::MemType::BUF);
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c());
   }
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c());
   ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
index bd86acedd5..04b2a7318a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
@@ -37,14 +37,10 @@ class PowerOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Run() override;
 
- private:
-  int Initweight();
-
  private:
   cl_int4 out_shape_{};
   bool broadcast_{false};
   bool use_fp16_enable_{false};
-  void *weight_{nullptr};
   float power_{1.0};
   float scale_{0.0};
   float shift_{1.0};
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
index 1d11eb0273..e4b55a67b3 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
@@ -48,8 +48,8 @@ TEST_F(TestPowerOpenCLCI, Int32CI) {
                          100.0, 121.0, 1728.0, 1.0, 196.0, 225.0, 16.0, 289.0};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
   }
 }
 
@@ -68,8 +68,8 @@ TEST_F(TestPowerOpenCLCI, Fp32CI) {
                   3.20657016, 0.64395994, 0.01526405, 0.13275899, 5.85509388, 0.16177453, 0.07150001, 0.0542811};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-2 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-2 : 1e-6);
   }
 }
 
@@ -87,8 +87,8 @@ TEST_F(TestPowerOpenCLCI, Fp32UnAlign) {
                   3.20657016, 0.64395994, 0.01526405, 0.13275899, 5.85509388, 0.16177453, 0.07150001};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-2 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-2 : 1e-6);
  }
 }
 
@@ -121,8 +121,8 @@ TEST_F(TestPowerOpenCLCI, Fp16CI) {
                   0.4856, 1.014, 0.2025, -1.736, 0.2134, 0.489, -0.596, 0.7466};
   for (auto fp16_enable : {true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-3 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-6);
   }
 }
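
Reviewer note (not part of the patch): the power.cl change replaces the unaligned __global weight buffer with a second image read, so both operands are now fetched element-wise and the kernel no longer needs the packed constant weight that Initweight() used to build; this is also why the tests can pass input1 as VAR instead of CONST_TENSOR. The sketch below is a minimal host-side C++ reference of the per-element math in the new kernel, under stated assumptions: OptimizedPowerImpl's body is outside the hunk, so the integer fast path here is assumed to be exponentiation by squaring (consistent with its visible "exponent >= 0 ? result : 1 / result" return), "scale" and "shift" stand for the kernel's parameter.z and parameter.y, and the names OptimizedPowerRef/PowerElementRef are illustrative only.

#include <cmath>
#include <cstdio>

// Assumed fast path for integral exponents: square-and-multiply on |exponent|,
// then invert for negative exponents (matches the visible return statement).
static float OptimizedPowerRef(float x, int exponent) {
  float result = 1.0f;
  float base = x;
  for (int e = exponent >= 0 ? exponent : -exponent; e > 0; e >>= 1) {
    if (e & 1) {
      result *= base;
    }
    base *= base;
  }
  return exponent >= 0 ? result : 1.0f / result;
}

// Per-element behaviour mirroring the kernel loop: input0 is scaled and
// shifted, then raised to the matching input1 value; integral exponents take
// the assumed fast path, everything else falls back to pow().
static float PowerElementRef(float in0, float in1, float scale, float shift) {
  float x = in0 * scale + shift;
  return std::floor(in1) == in1 ? OptimizedPowerRef(x, static_cast<int>(in1)) : std::pow(x, in1);
}

int main() {
  std::printf("%g\n", PowerElementRef(2.0f, 3.0f, 1.0f, 0.0f));   // 8
  std::printf("%g\n", PowerElementRef(9.0f, 0.5f, 1.0f, 0.0f));   // 3
  std::printf("%g\n", PowerElementRef(2.0f, -2.0f, 1.0f, 0.0f));  // 0.25
  return 0;
}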