From 2c6cfce70e280ab75ee648c169c5489f0e3c2496 Mon Sep 17 00:00:00 2001 From: wandongdong Date: Tue, 15 Sep 2020 00:33:52 -0700 Subject: [PATCH] fix fp16 bug and add gpu fp16 model to ci --- .../runtime/kernel/opencl/kernel/concat.cc | 1 + .../kernel/opencl/kernel/convolution.cc | 2 +- .../kernel/opencl/kernel/depthwise_conv2d.cc | 9 ++++- .../kernel/opencl/subgraph_opencl_kernel.cc | 13 +++++++ mindspore/lite/test/models_fp16_gpu.cfg | 2 + mindspore/lite/test/models_tflite_gpu.cfg | 1 + mindspore/lite/test/run_benchmark_nets.sh | 37 +++++++++++++++++++ 7 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 mindspore/lite/test/models_fp16_gpu.cfg diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 4f44aa4723..cca3a8b6eb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include +#include #include #include "src/kernel_registry.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 5701970747..4ca21a669e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -69,7 +69,7 @@ int ConvolutionOpenCLKernel::Init() { TILES_X_ = UP_DIV(OW_, 4); TILES_Y_ = UP_DIV(OH_, 4); TILES_XY_ = TILES_X_ * TILES_Y_; - use_winograd_ = UseWinograd4x4To6x6(); + use_winograd_ = UseWinograd4x4To6x6() && use_fp16_; // build kernel if (use_winograd_) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 811f3f5851..0082243c41 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -134,7 +134,14 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { size_t up_co_size = C4NUM * CO4 * dtype_size; memset(bias_data_, 0, up_co_size); auto ori_bias = in_tensors_.at(kBiasIndex)->MutableData(); - memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size); + if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat32) { + float16_t *bias_ptr = static_cast<float16_t *>(bias_data_); + for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) { + bias_ptr[i] = static_cast<float16_t>(static_cast<float *>(ori_bias)[i]); + } + } else { + memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size); + } allocator->UnmapBuffer(bias_data_); } else { MS_ASSERT(in_tensors_.size() == kInputSize1); diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index eb24a5df23..d77797a23e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++
b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -56,6 +56,19 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te } for (size_t i = 0; i < in_tensors.size(); ++i) { if (in_tensors.at(i)->shape().size() <= 1) { + if (mem_type == OpenCLMemType::IMG) { + for (auto &iv : in_kernels[i]) { + auto tensors = iv->in_tensors(); + tensors.emplace_back(in_tensors.at(i)); + iv->set_in_tensors(tensors); + } + } else { + for (auto &iv : in_kernels[i]) { + auto tensors = iv->out_tensors(); + tensors.emplace_back(in_tensors.at(i)); + iv->set_out_tensors(tensors); + } + } continue; } OpenCLKernel *cur_opencl_op = reinterpret_cast<OpenCLKernel *>(in_kernels[i][0]); diff --git a/mindspore/lite/test/models_fp16_gpu.cfg b/mindspore/lite/test/models_fp16_gpu.cfg new file mode 100644 index 0000000000..7e9823d43e --- /dev/null +++ b/mindspore/lite/test/models_fp16_gpu.cfg @@ -0,0 +1,2 @@ +mobilenet_v1_1.0_224.tflite +mobilenet_v2_1.0_224.tflite diff --git a/mindspore/lite/test/models_tflite_gpu.cfg b/mindspore/lite/test/models_tflite_gpu.cfg index 200f95b60c..7d6d2f31f3 100644 --- a/mindspore/lite/test/models_tflite_gpu.cfg +++ b/mindspore/lite/test/models_tflite_gpu.cfg @@ -1,6 +1,7 @@ mobilenet_v1_1.0_224.tflite mobilenet_v2_1.0_224.tflite resnet.tflite +squeezenet.tflite mtk_AADB_HADB_MBV2_model_fp32.tflite hiai_cn_recognize_modify_padv2.tflite hiai_cv_focusShootOCRModel_08.tflite diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh index 0c78e3788a..2edfb2bf4f 100644 --- a/mindspore/lite/test/run_benchmark_nets.sh +++ b/mindspore/lite/test/run_benchmark_nets.sh @@ -479,6 +479,42 @@ function Run_arm64() { fi done < ${models_tflite_gpu_config} + # Run GPU fp16 converted models: + while read line; do + model_name=${line} + if [[ $model_name == \#* ]]; then + continue + fi + echo ${model_name} >> "${run_benchmark_log_file}" + echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt + echo 'export
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt + adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}" + if [ $? = 0 ]; then + run_result='arm64_gpu_fp16: '${model_name}' pass' + echo ${run_result} >> ${run_benchmark_result_file} + else + run_result='arm64_gpu_fp16: '${model_name}' failed' + echo ${run_result} >> ${run_benchmark_result_file} + return 1 + fi + # run benchmark test without calib data + echo ${model_name} >> "${run_benchmark_log_file}" + echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt + adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}" + if [ $?
= 0 ]; then + run_result='arm64_gpu_fp16: '${model_name}' pass' + echo ${run_result} >> ${run_benchmark_result_file} + else + run_result='arm64_gpu_fp16: '${model_name}' failed' + echo ${run_result} >> ${run_benchmark_result_file} + return 1 + fi + #sleep 1 + done < ${models_fp16_gpu_config} + # Run mindir converted models: while read line; do model_name=${line} @@ -574,6 +610,7 @@ models_onnx_config=${basepath}/models_onnx.cfg models_fp16_config=${basepath}/models_fp16.cfg models_mindspore_config=${basepath}/models_mindspore.cfg models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg +models_fp16_gpu_config=${basepath}/models_fp16_gpu.cfg ms_models_path=${basepath}/ms_models