From 5562cd45516b70bd78d8caffe3fda4015af57947 Mon Sep 17 00:00:00 2001
From: chenzupeng <chenzupeng@huawei.com>
Date: Wed, 23 Sep 2020 10:40:38 +0800
Subject: [PATCH] fix bug: softmax precision, model random zero

---
 .../src/runtime/kernel/opencl/cl/softmax.cl   | 194 +++++++++---------
 .../kernel/opencl/subgraph_opencl_kernel.cc   |   2 +-
 .../src/runtime/opencl/opencl_allocator.cc    |   7 +-
 .../src/runtime/opencl/opencl_allocator.h     |   2 +-
 mindspore/lite/src/tensor.h                   |   1 +
 5 files changed, 106 insertions(+), 100 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
index 44555ee074..1141cb7171 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
@@ -14,9 +14,9 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp
 
   if (X >= H || Y >= W) return;
 
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)));
     sum += exp(t.x);
     if (d * 4 + 1 < C) sum += exp(t.y);
     if (d * 4 + 2 < C) sum += exp(t.z);
@@ -24,14 +24,15 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp
   }
 
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)));
     t = divide_no_check(exp(t), sum);
     __global FLT *output_flt = (__global FLT *)output;
     output_flt += (X * W + Y) * C + 4 * d;
-    output_flt[0] = t.x;
-    if (d * 4 + 1 < C) output_flt[1] += t.y;
-    if (d * 4 + 2 < C) output_flt[2] += t.z;
-    if (d * 4 + 3 < C) output_flt[3] += t.w;
+    FLT4 result = TO_FLT4(t);
+    output_flt[0] = result.x;
+    if (d * 4 + 1 < C) output_flt[1] += result.y;
+    if (d * 4 + 2 < C) output_flt[2] += result.z;
+    if (d * 4 + 3 < C) output_flt[3] += result.w;
   }
 }
 
@@ -45,9 +46,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2
 
   if (X >= H || Y >= W) return;
 
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)));
     sum += exp(t.x);
     if (d * 4 + 1 < C) sum += exp(t.y);
     if (d * 4 + 2 < C) sum += exp(t.z);
@@ -55,9 +56,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2
   }
 
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)));
     t = exp(t) / sum;
-    WRITE_IMAGE(output, (int2)(Y * S + d, X), t);
+    WRITE_IMAGE(output, (int2)(Y * S + d, X), TO_FLT4(t));
   }
 }
 
@@ -71,9 +72,9 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out
 
   if (X >= H || Y >= W) return;
 
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)));
     sum += exp(t.x);
     if (d * 4 + 1 < C) sum += exp(t.y);
     if (d * 4 + 2 < C) sum += exp(t.z);
@@ -81,14 +82,15 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out
   }
 
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)));
     t = divide_no_check(exp(t), sum);
     __global FLT *output_flt = (__global FLT *)output;
     output_flt += (X * W + Y) * C + 4 * d;
-    output_flt[0] = t.x;
-    if (d * 4 + 1 < C) output_flt[1] += t.y;
-    if (d * 4 + 2 < C) output_flt[2] += t.z;
-    if (d * 4 + 3 < C) output_flt[3] += t.w;
+    FLT4 result = TO_FLT4(t);
+    output_flt[0] = result.x;
+    if (d * 4 + 1 < C) output_flt[1] += result.y;
+    if (d * 4 + 2 < C) output_flt[2] += result.z;
+    if (d * 4 + 3 < C) output_flt[3] += result.w;
   }
 }
 
@@ -102,9 +104,9 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image
 
   if (X >= H || Y >= W) return;
 
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)));
     sum += exp(t.x);
     if (d * 4 + 1 < C) sum += exp(t.y);
     if (d * 4 + 2 < C) sum += exp(t.z);
@@ -112,51 +114,51 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image
   }
 
   for (int d = 0; d < S; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)));
     t = exp(t) / sum;
-    WRITE_IMAGE(output, (int2)(Y, d * H + X), t);
+    WRITE_IMAGE(output, (int2)(Y, d * H + X), TO_FLT4(t));
   }
 }
 
 __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
                                    const int slices, const int slices_x32) {
   int tid = get_local_id(0);
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    sum += dot((FLT4)(1.0f), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
+    sum += dot((float4)(1.0f), exp(src));
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
-
-    sum += dot(TO_FLT4(mask), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)));
+    sum += dot(convert_float4(mask), exp(src));
   }
 
-  __local FLT4 tmp[8];
-  __local FLT *tmpx1 = (__local FLT *)tmp;
+  __local float4 tmp[8];
+  __local float *tmpx1 = (__local float *)tmp;
   tmpx1[tid] = sum;
   barrier(CLK_LOCAL_MEM_FENCE);
   if (tid == 0) {
-    sum = dot((FLT4)(1.0f), tmp[0]);
-    sum += dot((FLT4)(1.0f), tmp[1]);
-    sum += dot((FLT4)(1.0f), tmp[2]);
-    sum += dot((FLT4)(1.0f), tmp[3]);
-    sum += dot((FLT4)(1.0f), tmp[4]);
-    sum += dot((FLT4)(1.0f), tmp[5]);
-    sum += dot((FLT4)(1.0f), tmp[6]);
-    sum += dot((FLT4)(1.0f), tmp[7]);
+    sum = dot((float4)(1.0f), tmp[0]);
+    sum += dot((float4)(1.0f), tmp[1]);
+    sum += dot((float4)(1.0f), tmp[2]);
+    sum += dot((float4)(1.0f), tmp[3]);
+    sum += dot((float4)(1.0f), tmp[4]);
+    sum += dot((float4)(1.0f), tmp[5]);
+    sum += dot((float4)(1.0f), tmp[6]);
+    sum += dot((float4)(1.0f), tmp[7]);
     tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0));
+    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
     result = exp(result) * sum;
-    output[i] = result;
+    output[i] = TO_FLT4(result);
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
-    result = exp(result) * sum;
+    float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)));
+    result_float = exp(result_float) * sum;
+    FLT4 result = TO_FLT4(result_float);
     __global FLT4 *remain_ptr4 = output;
     remain_ptr4 += slices - 1;
     __global FLT *remain_ptr = (__global FLT *)remain_ptr4;
@@ -176,80 +178,81 @@ __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *o
 __kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                                    const int slices, const int slices_x32) {
   int tid = get_local_id(0);
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    sum += dot((FLT4)(1.0f), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
+    sum += dot((float4)(1.0f), exp(src));
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)));
 
-    sum += dot(TO_FLT4(mask), exp(src));
+    sum += dot(convert_float4(mask), exp(src));
   }
 
-  __local FLT4 tmp[8];
-  __local FLT *tmpx1 = (__local FLT *)tmp;
+  __local float4 tmp[8];
+  __local float *tmpx1 = (__local float *)tmp;
   tmpx1[tid] = sum;
   barrier(CLK_LOCAL_MEM_FENCE);
   if (tid == 0) {
-    sum = dot((FLT4)(1.0f), tmp[0]);
-    sum += dot((FLT4)(1.0f), tmp[1]);
-    sum += dot((FLT4)(1.0f), tmp[2]);
-    sum += dot((FLT4)(1.0f), tmp[3]);
-    sum += dot((FLT4)(1.0f), tmp[4]);
-    sum += dot((FLT4)(1.0f), tmp[5]);
-    sum += dot((FLT4)(1.0f), tmp[6]);
-    sum += dot((FLT4)(1.0f), tmp[7]);
+    sum = dot((float4)(1.0f), tmp[0]);
+    sum += dot((float4)(1.0f), tmp[1]);
+    sum += dot((float4)(1.0f), tmp[2]);
+    sum += dot((float4)(1.0f), tmp[3]);
+    sum += dot((float4)(1.0f), tmp[4]);
+    sum += dot((float4)(1.0f), tmp[5]);
+    sum += dot((float4)(1.0f), tmp[6]);
+    sum += dot((float4)(1.0f), tmp[7]);
     tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
   for (size_t i = tid; i < slices; i += 32) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0));
+    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
     result = exp(result) * sum;
-    WRITE_IMAGE(output, (int2)(i, 0), result);
+    WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result));
   }
 }
 
 __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
                                     const int slices, const int slices_x32) {
   int tid = get_local_id(0);
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i));
-    sum += dot((FLT4)(1.0f), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i)));
+    sum += dot((float4)(1.0f), exp(src));
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)));
 
-    sum += dot(TO_FLT4(mask), exp(src));
+    sum += dot(convert_float4(mask), exp(src));
   }
 
-  __local FLT4 tmp[8];
-  __local FLT *tmpx1 = (__local FLT *)tmp;
+  __local float4 tmp[8];
+  __local float *tmpx1 = (__local float *)tmp;
   tmpx1[tid] = sum;
   barrier(CLK_LOCAL_MEM_FENCE);
   if (tid == 0) {
-    sum = dot((FLT4)(1.0f), tmp[0]);
-    sum += dot((FLT4)(1.0f), tmp[1]);
-    sum += dot((FLT4)(1.0f), tmp[2]);
-    sum += dot((FLT4)(1.0f), tmp[3]);
-    sum += dot((FLT4)(1.0f), tmp[4]);
-    sum += dot((FLT4)(1.0f), tmp[5]);
-    sum += dot((FLT4)(1.0f), tmp[6]);
-    sum += dot((FLT4)(1.0f), tmp[7]);
+    sum = dot((float4)(1.0f), tmp[0]);
+    sum += dot((float4)(1.0f), tmp[1]);
+    sum += dot((float4)(1.0f), tmp[2]);
+    sum += dot((float4)(1.0f), tmp[3]);
+    sum += dot((float4)(1.0f), tmp[4]);
+    sum += dot((float4)(1.0f), tmp[5]);
+    sum += dot((float4)(1.0f), tmp[6]);
+    sum += dot((float4)(1.0f), tmp[7]);
     tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i)));
     result = exp(result) * sum;
-    output[i] = result;
+    output[i] = TO_FLT4(result);
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
-    result = exp(result) * sum;
+    float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)));
+    result_float = exp(result_float) * sum;
+    FLT4 result = TO_FLT4(result_float);
     __global FLT4 *remain_ptr4 = output;
     remain_ptr4 += slices - 1;
     __global FLT *remain_ptr = (__global FLT *)remain_ptr4;
@@ -269,37 +272,36 @@ __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *
 __kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                                     const int slices, const int slices_x32) {
   int tid = get_local_id(0);
-  FLT sum = 0.0f;
+  float sum = 0.0f;
   for (size_t i = tid; i < slices - 1; i += 32) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i));
-    sum += dot((FLT4)(1.0f), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i)));
+    sum += dot((float4)(1.0f), exp(src));
   }
   if ((slices - 1) % 32 == tid) {
-    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
-
-    sum += dot(TO_FLT4(mask), exp(src));
+    float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)));
+    sum += dot(convert_float4(mask), exp(src));
   }
 
-  __local FLT4 tmp[8];
-  __local FLT *tmpx1 = (__local FLT *)tmp;
+  __local float4 tmp[8];
+  __local float *tmpx1 = (__local float *)tmp;
   tmpx1[tid] = sum;
   barrier(CLK_LOCAL_MEM_FENCE);
   if (tid == 0) {
-    sum = dot((FLT4)(1.0f), tmp[0]);
-    sum += dot((FLT4)(1.0f), tmp[1]);
-    sum += dot((FLT4)(1.0f), tmp[2]);
-    sum += dot((FLT4)(1.0f), tmp[3]);
-    sum += dot((FLT4)(1.0f), tmp[4]);
-    sum += dot((FLT4)(1.0f), tmp[5]);
-    sum += dot((FLT4)(1.0f), tmp[6]);
-    sum += dot((FLT4)(1.0f), tmp[7]);
+    sum = dot((float4)(1.0f), tmp[0]);
+    sum += dot((float4)(1.0f), tmp[1]);
+    sum += dot((float4)(1.0f), tmp[2]);
+    sum += dot((float4)(1.0f), tmp[3]);
+    sum += dot((float4)(1.0f), tmp[4]);
+    sum += dot((float4)(1.0f), tmp[5]);
+    sum += dot((float4)(1.0f), tmp[6]);
+    sum += dot((float4)(1.0f), tmp[7]);
     tmpx1[0] = divide_no_check(1.0f, sum);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   sum = tmpx1[0];
   for (size_t i = tid; i < slices; i += 32) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i)));
     result = exp(result) * sum;
-    WRITE_IMAGE(output, (int2)(0, i), result);
+    WRITE_IMAGE(output, (int2)(0, i), TO_FLT4(result));
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
index c5d6857fcc..6d2e4ab367 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
@@ -129,7 +129,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
       MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!";
       delete new_tensor;
       new_tensor = nullptr;
-      delete parameter;
+      free(parameter);
       parameter = nullptr;
       return RET_ERROR;
     }
diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
index 0391ac3e3c..1e8229421c 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@@ -111,8 +111,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
     ocl_runtime_->UnmapBuffer(*mem, host_ptr);
     if (!img_size.empty()) {
       cl::ImageFormat image_format(CL_RGBA, img_size[2]);
-      image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0],
-                                                          img_size[1], img_pitch * dtype_size, &ret);
+      image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0], img_size[1],
+                                             img_pitch * dtype_size, &ret);
       if (image == nullptr || ret != CL_SUCCESS) {
         delete buffer;
         UnLock();
@@ -265,6 +265,9 @@ void OpenCLAllocator::Clear() {
   Lock();
   auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
   for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
+    if (it->second->map_flags) {
+      UnmapBuffer(it->second->host_ptr_);
+    }
     if (svm_capabilities) {
       clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);
       MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h
index 6beaf92431..978bbf19e7 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h
@@ -63,7 +63,7 @@ class OpenCLAllocator : public Allocator {
   int GetImageSize(void *host_ptr, std::vector<size_t> *img_size);
   void *Prepare(void *ptr) override {
     if (ptr != nullptr) {
-      ptr = MapBuffer(ptr, CL_MAP_WRITE, nullptr, true);
+      ptr = MapBuffer(ptr, CL_MAP_READ | CL_MAP_WRITE, nullptr, true);
     }
     return ptr;
   }
diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h
index 0fa96d3c03..0296503e02 100644
--- a/mindspore/lite/src/tensor.h
+++ b/mindspore/lite/src/tensor.h
@@ -184,6 +184,7 @@ class Tensor : public mindspore::tensor::MSTensor {
         MS_LOG(WARNING) << "Malloc data failed";
       }
     }
+    Prepare();
     return this->data_;
   }