!5787 [MSLITE] add cast to opencl to_format op

Merge pull request !5787 from wandongdong/master
pull/5787/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit f91d80e0c3

@ -1,3 +1,4 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define divide_no_check(a, b) (a / b)
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
@ -62,7 +63,7 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), weight * a + bias);
WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias);
}
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,

File diff suppressed because it is too large Load Diff

@ -42,10 +42,12 @@ int ToFormatOpenCLKernel::Init() {
{schema::Format_NC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"}};
std::string kernel_name =
"to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
std::map<TypeId, std::string> dtype_str{
{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt8, "Int8"}};
if (out_mem_type_ == OpenCLMemType::IMG) {
kernel_name += "_IMG";
kernel_name += "_IMG_" + dtype_str[in_tensors_[0]->data_type()];
} else {
kernel_name += "_BUF";
kernel_name += "_BUF_" + dtype_str[out_tensors_[0]->data_type()];
}
this->set_name(kernel_name);

@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include <set>
#include "src/runtime/opencl/opencl_executor.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/utils.h"
@ -181,11 +182,31 @@ int SubGraphOpenCLKernel::Init() {
}
nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end());
UpdateTensorDataType();
MallocTensorWithReuse();
return RET_OK;
}
// Switch the data type of the subgraph's *interior* tensors to fp16 when the
// OpenCL runtime is running in fp16 mode, so buffers exchanged between kernels
// match the kernels' half-precision compute type. Subgraph boundary tensors
// (inputs/outputs) are left untouched; per the diff above, Init() inserts
// to-format/convert ops at the boundary.
// Returns RET_OK unconditionally.
int SubGraphOpenCLKernel::UpdateTensorDataType() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
bool is_fp16 = ocl_runtime->GetFp16Enable();
// Only convert when fp16 is enabled and the graph was fed fp32 data.
// NOTE(review): only in_tensors_[0] is inspected — this assumes all subgraph
// inputs share one dtype; confirm against callers.
if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
// Boundary set: subgraph inputs and outputs, which must keep their dtype.
std::set<lite::tensor::Tensor *> out_set;
out_set.insert(in_tensors_.begin(), in_tensors_.end());
out_set.insert(out_tensors_.begin(), out_tensors_.end());
for (auto iv : nodes_) {
auto cur_outs = iv->out_tensors();
for (auto jv : cur_outs) {
if (out_set.count(jv) == 0) {
// Interior tensor produced by a node: flip it to fp16.
jv->set_data_type(kNumberTypeFloat16);
}
}
}
}
return RET_OK;
}
int SubGraphOpenCLKernel::MallocTensorWithReuse() {
kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
for (auto *kernel : nodes_) {

@ -46,6 +46,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
int UnInit();
protected:
int UpdateTensorDataType();
int MallocTensorWithReuse();
int GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors,
const std::vector<std::vector<kernel::LiteKernel *>> in_kernels,

@ -301,12 +301,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
// fp16 enable, kernel will use half and read_imageh and write_imageh.
build_options_str =
"-DFLT=half -DFLT4=half4 -DFLT16=half16 "
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4 ";
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 ";
} else {
// fp16 not enable, kernel will use float and read_imagef and write_imagef.
build_options_str =
"-DFLT=float -DFLT4=float4 -DFLT16=float16 "
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4 ";
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 ";
}
auto build_options_ext = std::accumulate(

Loading…
Cancel
Save