From 937dbf8b03f4172897804bfd3b334227128356ee Mon Sep 17 00:00:00 2001
From: wandongdong
Date: Wed, 2 Sep 2020 04:08:33 -0700
Subject: [PATCH] add gpu benchmark

---
 mindspore/lite/src/lite_kernel.h                  |  5 ---
 .../src/runtime/kernel/opencl/cl/transpose.cl     | 18 +++++++--
 .../kernel/opencl/kernel/activation.cc            |  2 +-
 .../kernel/opencl/kernel/conv2d_transpose.cc      |  2 +-
 .../runtime/kernel/opencl/kernel/matmul.cc        |  2 +-
 .../runtime/kernel/opencl/kernel/reshape.cc       |  3 ++
 mindspore/lite/test/models_tflite_gpu.cfg         |  3 ++
 mindspore/lite/test/run_benchmark_nets.sh         | 37 +++++++++++++++++++
 .../src/runtime/kernel/opencl/matmul_tests.cc     |  2 +-
 9 files changed, 61 insertions(+), 13 deletions(-)
 create mode 100644 mindspore/lite/test/models_tflite_gpu.cfg

diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h
index d646135267..713257ac2d 100644
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -27,11 +27,6 @@
 #include "src/ir/tensor.h"
 #include "include/errorcode.h"
 
-#ifdef ENABLE_FP16
-using FLOAT_t = float16_t;
-#else
-using FLOAT_t = float;
-#endif
 
 // using mindspore::kernel::AddressPtr;
 namespace mindspore::kernel {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
index 7be8e94b74..4b07a1e89e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@@ -55,10 +55,20 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
   result[1] = (FLT4)(0.0f);
   result[2] = (FLT4)(0.0f);
   result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  bool over_size = W * C.y > 65535;
+  FLT4 x0, x1, x2, x3;
+  if (over_size) {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 1));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 2));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 3));
+  } else {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  }
+
   result[0].x = x0.x;
   result[0].y = x1.x;
   result[0].z = x2.x;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index d38b43da44..19685cec4f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -44,7 +44,7 @@ void ActivationOpenClKernel::InitBuffer() {
   alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
   memset(alpha_buff_, 0x00, fp_size);
   if (enable_fp16_) {
-    auto fp16 = (float16_t)alpha_;
+    auto fp16 = (int16_t)alpha_;
     memcpy(alpha_buff_, &fp16, fp_size);
   } else {
     memcpy(alpha_buff_, &alpha_, fp_size);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 27106da1fd..e9385cef48 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -72,7 +72,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
   int div_ci = UP_DIV(ci, C4NUM);
   int div_co = UP_DIV(co, C4NUM);
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-  auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
 
   // IHWO to OHWI4(I)4(O)(converter format is IHWO)
   // init padWeight_(buffer mem)
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 9431e555d5..af81b3747b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -75,7 +75,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
 
 void MatMulOpenCLKernel::PadWeight() {
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-  size_t dtype_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
   padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index d3d21a8199..c4124a743c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -27,6 +27,7 @@
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Reshape;
+using mindspore::schema::PrimitiveType_Squeeze;
 
 namespace mindspore::kernel {
@@ -142,4 +143,6 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        # run benchmark test without clib data
+        #echo ${model_name}
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        #sleep 1
+    done < ${models_tflite_gpu_config}
+}
 
 # Print start msg before run testcase
@@ -397,6 +433,7 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
 models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
+models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
 
 Convert_status=0
 rm -rf ${basepath}/ms_models
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
index 8b21e5f845..5a6ea2ef89 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@@ -36,7 +36,7 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we
   size_t dtype_size = sizeof(float);
   if (enable_fp16) {
     ocl_runtime->SetFp16Enable(true);
-    dtype_size = sizeof(float16_t);
+    dtype_size = sizeof(int16_t);
   }
   auto allocator = ocl_runtime->GetAllocator();
   int ci = shape[0];