!10405 [MS][LITE][Develop] add new ops named layer_norm for gpu

From: @pengyongrong Reviewed-by: @ddwsky,@zhanghaibo5 Signed-off-by: @ddwsky
4 years ago · c243fb92bd
parent 162e33baf0 f8847e427e
commit c243fb92bd
10 changed files with 484 additions and 2 deletions
--- a/cmake/external_libs/opencl.cmake
+++ b/cmake/external_libs/opencl.cmake
@ -1,9 +1,9 @@
 if (ENABLE_GITEE)
-    set(REQ_URL "https://gitee.com/mirrors/flatbuffers/repository/archive/v2020.06.16.tar.gz")
+    set(REQ_URL "https://gitee.com/mirrors/OpenCL-Headers/repository/archive/v2020.06.16.tar.gz")
    set(MD5 "fc7627b5a8a95ecbe3d5df43bc88aa44")
    set(PKG_GIT_TAG "")
    __download_pkg_with_git(OpenCL-Headers ${REQ_URL} ${PKG_GIT_TAG} ${MD5})
-    set(REQ_URL "https://gitee.com/mirrors/flatbuffers/repository/archive/v2.0.12.tar.gz")
+    set(REQ_URL "https://gitee.com/mirrors/OpenCL-CLHPP/repository/archive/v2.0.12.tar.gz")
    set(MD5 "bd00fca8f861b3b65660d719f00a58dd")
    set(PKG_GIT_TAG "")
    __download_pkg_with_git(OpenCL-CLHPP ${REQ_URL} ${PKG_GIT_TAG} ${MD5})
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl
@ -0,0 +1,103 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 #define C4NUM 4
 __kernel void ComputeMeanVarDim1NHWC4(__read_only image2d_t src_data, __global FLT *mean_, __global FLT *variance_,
                                      int4 in_shape, int normalized_shape_size) {
  int X = get_global_id(0);  // n*h
  int Y = get_global_id(1);  // w
  if (X > in_shape.x * in_shape.y || Y > in_shape.z || in_shape.y == 0) {
    return;
  }
  int n = X / in_shape.y;
  int h = X % in_shape.y;
  int w = Y;
  int ci4 = UP_DIV(in_shape.w, C4NUM);
  int remainder = in_shape.w % C4NUM;
  FLT4 mean_temp = {0.0f, 0.0f, 0.0f, 0.0f};
  FLT4 var_temp = {0.0f, 0.0f, 0.0f, 0.0f};
  FLT mean = 0.0f;
  FLT var = 0.0f;
  // compute mean
  for (int i = 0; i < ci4; ++i) {
    FLT4 result_temp = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + i, n * in_shape.y + h));
    mean_temp += result_temp;
  }
  mean = (mean_temp.x + mean_temp.y + mean_temp.z + mean_temp.w) / normalized_shape_size;
  mean_temp.x = mean_temp.y = mean_temp.z = mean_temp.w = mean;
  // compute var
  for (int i = 0; i < ci4; ++i) {
    FLT4 result_temp = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + i, n * in_shape.y + h));
    if ((i + 1) * C4NUM <= in_shape.w) {
      var_temp += (result_temp - mean_temp) * (result_temp - mean_temp);
    } else {
      if (remainder == 1) {
        mean_temp.x = mean;
        mean_temp.y = mean_temp.z = mean_temp.w = 0.0f;
      } else if (remainder == 2) {
        mean_temp.x = mean_temp.y = mean;
        mean_temp.z = mean_temp.w = 0.0f;
      } else {
        mean_temp.x = mean_temp.y = mean_temp.z = mean;
        mean_temp.w = 0.0f;
      }
      var_temp += (result_temp - mean_temp) * (result_temp - mean_temp);
    }
  }
  var = (var_temp.x + var_temp.y + var_temp.z + var_temp.w) / normalized_shape_size;
  // write result to dst
  int postion = (n * in_shape.y + h) * in_shape.z + w;
  mean_[postion] = mean;
  variance_[postion] = var;
 }
 __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
                                       __global FLT *mean_, __global FLT *variance_, __global FLT *gamma_,
                                       __global FLT *beta_, int4 in_shape, float epsilon_, int normalized_dims_,
                                       int elementwise_affine_) {
  int X = get_global_id(0);  // n*h
  int Y = get_global_id(1);  // w
  int Z = get_global_id(2);  // c4
  if (X >= in_shape.x * in_shape.y || Y >= in_shape.z || Z >= in_shape.w || in_shape.y == 0) {
    return;
  }
  int n = X / in_shape.y;
  int h = X % in_shape.y;
  int w = Y;
  int c = Z;
  int ci4 = UP_DIV(in_shape.w, C4NUM);
  int postion_mv = 0;
  int postion_gb = 0;
  if (normalized_dims_ == 1) {
    postion_mv = (n * in_shape.y + h) * in_shape.z + w;
    postion_gb = c * C4NUM;
  } else if (normalized_dims_ == 2) {
    postion_mv = n * in_shape.y + h;
    postion_gb = w * ci4 * C4NUM + c * C4NUM;
  } else if (normalized_dims_ == 3) {
    postion_mv = n;
    postion_gb = (h * in_shape.z + w) * ci4 * C4NUM + c * C4NUM;
  }
  FLT4 result = {0.0f, 0.0f, 0.0f, 0.0f};
  FLT4 result_in = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + c, n * in_shape.y + h));
  if (elementwise_affine_) {
    result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb] +
               beta_[postion_gb];
    result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 1] +
               beta_[postion_gb + 1];
    result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 2] +
               beta_[postion_gb + 2];
    result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 3] +
               beta_[postion_gb + 3];
  } else {
    result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_));
    result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_));
    result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_));
    result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_));
  }
  WRITE_IMAGE(dst_data, (int2)((w * ci4 + c), (n * in_shape.y + h)), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h
@ -0,0 +1,61 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_LAYER_NORM_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_LAYER_NORM_H_
 #include <vector>
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 namespace mindspore::kernel {
 class LayerNormOpenCLKernel : public OpenCLKernel {
 public:
  LayerNormOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                        const std::vector<lite::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs) {}
  ~LayerNormOpenCLKernel() override = default;
  int Run() override;
  int Prepare() override;
  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;
 private:
  int Initweight();
  void GetMeanVar();
 private:
  cl::Kernel kernel_mean_var_;
  cl::NDRange global_mean_var_, local_mean_var_;
  bool use_fp16_enable_{false};
  void *gamma_{nullptr};
  void *mean_{nullptr};
  void *var_{nullptr};
  void *beta_{nullptr};
  cl_int4 in_shape_{};
  int elementwise_affine_;
  int32_t normalized_dims_{1};
  int normalized_shape_size_{1};
  float epsilon_{0.0f};
  cl::Kernel kernel_;
 };
 }  // namespace mindspore::kernel
 #endif
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
@ -104,6 +104,10 @@ void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std
    return;
  }
  // call sub_graph->Init() after construct subgraph like scheduler.cc
  MS_LOG(DEBUG) << "call sub_graph->Init()";
  EXPECT_TRUE(sub_graph->Init() == RET_OK);
  // simulating benchmark:  session_->CompileGraph() -> PrepareKernels() -> OpenCLSubGraph.Prepare()
  MS_LOG(DEBUG) << "call sub_graph->Prepare()";
  EXPECT_TRUE(sub_graph->Prepare() == RET_OK);  // will set Tensor's allocator be OpenCLAllocator
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc
@ -0,0 +1,62 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "ut/src/runtime/kernel/opencl/common.h"
 #include "nnacl/layer_norm_parameter.h"
 namespace mindspore::lite::opencl::test {
 class TestOpenCL_LayerNorm : public CommonTest {};
 namespace {
 // PrimitiveType_Stack: src/ops/populate/stack_populate.cc
 OpParameter *CreateParameter(float epsilon, int normalized_dims_, std::vector<int> normalizedShape) {
  auto *param = test::CreateParameter<LayerNormParameter>(schema::PrimitiveType_LayerNorm);
  param->elementwise_mode_ = ELEMENTWISE_PER_CHANNEL;
  param->epsilon_ = epsilon;
  param->normalized_dims_ = normalized_dims_;
  for (int i = 0; i < normalizedShape.size() && i < normalized_dims_; ++i) {
    param->normalized_shape_[i] = normalizedShape[i];
  }
  return reinterpret_cast<OpParameter *>(param);
 }
 }  // namespace
 TEST_F(TestOpenCL_LayerNorm, test1) {
  float epsilon = 1e-5;
  int normalized_dims_ = 1;
  std::vector<int> normalizedShape = {5};
  std::vector<int> input_shape = {2, 3, 4, 5};
  std::vector<int> gamma_shape = {1, 1, 1, 5};
  std::vector<int> beta_shape = {1, 1, 1, 5};
  std::vector<int> output_shape = {2, 3, 4, 5};
  size_t input_size, gamma_size, beta_size, output_size;
  std::string inputPpath = "./test_data/layernormfp32_input.bin";
  std::string gammaPpath = "./test_data/gammafp32_input.bin";
  std::string betaPpath = "./test_data/betafp32_input.bin";
  std::string correctOutputPath = "./test_data/layernormfp32_output.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(inputPpath.c_str(), &input_size));
  auto gamma_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(gammaPpath.c_str(), &gamma_size));
  auto beta_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(betaPpath.c_str(), &beta_size));
  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(epsilon, normalized_dims_, normalizedShape);
    TestMain(
      {{input_shape, input_data, VAR}, {gamma_shape, gamma_data, CONST_TENSOR}, {beta_shape, beta_data, CONST_TENSOR}},
      {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-6);
  }
 }
 }  // namespace mindspore::lite::opencl::test
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/betafp32_input.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/betafp32_input.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/gammafp32_input.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/gammafp32_input.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/layernormfp32_input.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/layernormfp32_input.bin
@ -0,0 +1,2 @@
 Sv■╬Ba≥╬Ъ9<=∙╒∙©cЪ©Н╨ц?E⌠y©°┤O>	│╬©яб=▐ьз©┼Dж=╘0к>v╕©╟+д>пF?⌠═╬лЁ'>╧К█©М8;▐Ы8©q>╚?#░H©└█юsEg>хА@с?ycю©Я>7▌©┌ф$©cЗ╛©KQ?▓Т1©3╖C╬Т▐╧?╞┌?нЙ©╪c@ыХ╬Ы^р?©oБ?,$2?1e"?]О╝©Ю╗╬╖9v╬Ц?NTюхl	?║д┌>Х2?}▌©Lё┤>Z~┘?ИK©цо
 ╬²│═?╬;╘|╦?╡▀б©▓▒~?≈Шё©~t░>#╓h©≥Ыl?Ш ?─r©=СО╨©{Юq>∙▐?с8╨>}I?мU©X©~сy╬лx╦?P│>Б(©лbЭ╫Ь╓▐©╜хБ©r╖n?cф│╫
q|?хА>0┤©Щ╧┘©Ю|?ТЫ©еo?хp??╫Ф©?Pю╪[·@²©эЖ╝>0╩©2Vt©╣╛ч=ужk>0╗А╬╤hн>╓6╧=XЩ┘╬'яG=щg>°╝^╫zи┴?	ь">8|Б>Ljг╩▀M▓©ш▌I?o>Щ>√z┤>▓/}©с%И╬╨и░©&[▌?
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/layernormfp32_output.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/layer_norm/test1/layernormfp32_output.bin
		`@ -0,0 +1,2 @@`
							`Sv■╬Ba≥╬Ъ9<=∙╒∙©cЪ©Н╨ц?E⌠y©°┤O> │╬©яб=▐ьз©┼Dж=╘0к>v╕©╟+д>пF?⌠═╬лЁ'>╧К█©М8;▐Ы8©q>╚?#░H©└█юsEg>хА@с?ycю©Я>7▌©┌ф$©cЗ╛©KQ?▓Т1©3╖C╬Т▐╧?╞┌?нЙ©╪c@ыХ╬Ы^р?©oБ?,$2?1e"?]О╝©Ю╗╬╖9v╬Ц?NTюхl ?║д┌>Х2?}▌©Lё┤>Z~┘?ИK©цо`
							`╬²│═?╬;╘\|╦?╡▀б©▓▒~?≈Шё©~t░>#╓h©≥Ыl?Ш ?─r©=СО╨©{Юq>∙▐?с8╨>}I?мU©X©~сy╬лx╦?P│>Б(©лbЭ╫Ь╓▐©╜хБ©r╖n?cф│╫ q\|?хА>0┤©Щ╧┘©Ю\|?ТЫ©еo?хp??╫Ф©?Pю╪[·@²©эЖ╝>0╩©2Vt©╣╛ч=ужk>0╗А╬╤hн>╓6╧=XЩ┘╬'яG=щg>°╝^╫zи┴? ь">8\|Б>Ljг╩▀M▓©ш▌I?o>Щ>√z┤>▓/}©с%И╬╨и░©&[▌?`