From 5562cd45516b70bd78d8caffe3fda4015af57947 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Wed, 23 Sep 2020 10:40:38 +0800 Subject: [PATCH] fix bug: softmax precision, model random zero --- .../src/runtime/kernel/opencl/cl/softmax.cl | 194 +++++++++--------- .../kernel/opencl/subgraph_opencl_kernel.cc | 2 +- .../src/runtime/opencl/opencl_allocator.cc | 7 +- .../src/runtime/opencl/opencl_allocator.h | 2 +- mindspore/lite/src/tensor.h | 1 + 5 files changed, 106 insertions(+), 100 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl index 44555ee074..1141cb7171 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl @@ -14,9 +14,9 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp if (X >= H || Y >= W) return; - FLT sum = 0.0f; + float sum = 0.0f; for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); sum += exp(t.x); if (d * 4 + 1 < C) sum += exp(t.y); if (d * 4 + 2 < C) sum += exp(t.z); @@ -24,14 +24,15 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp } for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); t = divide_no_check(exp(t), sum); __global FLT *output_flt = (__global FLT *)output; output_flt += (X * W + Y) * C + 4 * d; - output_flt[0] = t.x; - if (d * 4 + 1 < C) output_flt[1] += t.y; - if (d * 4 + 2 < C) output_flt[2] += t.z; - if (d * 4 + 3 < C) output_flt[3] += t.w; + FLT4 result = TO_FLT4(t); + output_flt[0] = result.x; + if (d * 4 + 1 < C) output_flt[1] += result.y; + if (d * 4 + 2 < C) output_flt[2] += result.z; + if (d * 4 + 3 < C) output_flt[3] += result.w; } } @@ -45,9 +46,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2 if (X >= H || Y >= W) return; - FLT sum = 0.0f; + float sum = 0.0f; for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); sum += exp(t.x); if (d * 4 + 1 < C) sum += exp(t.y); if (d * 4 + 2 < C) sum += exp(t.z); @@ -55,9 +56,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2 } for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); t = exp(t) / sum; - WRITE_IMAGE(output, (int2)(Y * S + d, X), t); + WRITE_IMAGE(output, (int2)(Y * S + d, X), TO_FLT4(t)); } } @@ -71,9 +72,9 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out if (X >= H || Y >= W) return; - FLT sum = 0.0f; + float sum = 0.0f; for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); sum += exp(t.x); if (d * 4 + 1 < C) sum += exp(t.y); if (d * 4 + 2 < C) sum += exp(t.z); @@ -81,14 +82,15 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out } for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); t = divide_no_check(exp(t), sum); __global FLT *output_flt = (__global FLT *)output; output_flt += (X * W + Y) * C + 4 * d; - output_flt[0] = t.x; - if (d * 4 + 1 < C) output_flt[1] += t.y; - if (d * 4 + 2 < C) output_flt[2] += t.z; - if (d * 4 + 3 < C) output_flt[3] += t.w; + FLT4 result = TO_FLT4(t); + output_flt[0] = result.x; + if (d * 4 + 1 < C) output_flt[1] += result.y; + if (d * 4 + 2 < C) output_flt[2] += result.z; + if (d * 4 + 3 < C) output_flt[3] += result.w; } } @@ -102,9 +104,9 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image if (X >= H || Y >= W) return; - FLT sum = 0.0f; + float sum = 0.0f; for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); sum += exp(t.x); if (d * 4 + 1 < C) sum += exp(t.y); if (d * 4 + 2 < C) sum += exp(t.z); @@ -112,51 +114,51 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image } for (int d = 0; d < S; ++d) { - FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); t = exp(t) / sum; - WRITE_IMAGE(output, (int2)(Y, d * H + X), t); + WRITE_IMAGE(output, (int2)(Y, d * H + X), TO_FLT4(t)); } } __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, const int slices, const int slices_x32) { int tid = get_local_id(0); - FLT sum = 0.0f; + float sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0)); - sum += dot((FLT4)(1.0f), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); + sum += dot((float4)(1.0f), exp(src)); } if ((slices - 1) % 32 == tid) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); - - sum += dot(TO_FLT4(mask), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); + sum += dot(convert_float4(mask), exp(src)); } - __local FLT4 tmp[8]; - __local FLT *tmpx1 = (__local FLT *)tmp; + __local float4 tmp[8]; + __local float *tmpx1 = (__local float *)tmp; tmpx1[tid] = sum; barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) { - sum = dot((FLT4)(1.0f), tmp[0]); - sum += dot((FLT4)(1.0f), tmp[1]); - sum += dot((FLT4)(1.0f), tmp[2]); - sum += dot((FLT4)(1.0f), tmp[3]); - sum += dot((FLT4)(1.0f), tmp[4]); - sum += dot((FLT4)(1.0f), tmp[5]); - sum += dot((FLT4)(1.0f), tmp[6]); - sum += dot((FLT4)(1.0f), tmp[7]); + sum = dot((float4)(1.0f), tmp[0]); + sum += dot((float4)(1.0f), tmp[1]); + sum += dot((float4)(1.0f), tmp[2]); + sum += dot((float4)(1.0f), tmp[3]); + sum += dot((float4)(1.0f), tmp[4]); + sum += dot((float4)(1.0f), tmp[5]); + sum += dot((float4)(1.0f), tmp[6]); + sum += dot((float4)(1.0f), tmp[7]); tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); + float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); result = exp(result) * sum; - output[i] = result; + output[i] = TO_FLT4(result); } if ((slices - 1) % 32 == tid) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); - result = exp(result) * sum; + float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); + result_float = exp(result_float) * sum; + FLT4 result = TO_FLT4(result_float); __global FLT4 *remain_ptr4 = output; remain_ptr4 += slices - 1; __global FLT *remain_ptr = (__global FLT *)remain_ptr4; @@ -176,80 +178,81 @@ __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *o __kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, const int slices, const int slices_x32) { int tid = get_local_id(0); - FLT sum = 0.0f; + float sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0)); - sum += dot((FLT4)(1.0f), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); + sum += dot((float4)(1.0f), exp(src)); } if ((slices - 1) % 32 == tid) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); - sum += dot(TO_FLT4(mask), exp(src)); + sum += dot(convert_float4(mask), exp(src)); } - __local FLT4 tmp[8]; - __local FLT *tmpx1 = (__local FLT *)tmp; + __local float4 tmp[8]; + __local float *tmpx1 = (__local float *)tmp; tmpx1[tid] = sum; barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) { - sum = dot((FLT4)(1.0f), tmp[0]); - sum += dot((FLT4)(1.0f), tmp[1]); - sum += dot((FLT4)(1.0f), tmp[2]); - sum += dot((FLT4)(1.0f), tmp[3]); - sum += dot((FLT4)(1.0f), tmp[4]); - sum += dot((FLT4)(1.0f), tmp[5]); - sum += dot((FLT4)(1.0f), tmp[6]); - sum += dot((FLT4)(1.0f), tmp[7]); + sum = dot((float4)(1.0f), tmp[0]); + sum += dot((float4)(1.0f), tmp[1]); + sum += dot((float4)(1.0f), tmp[2]); + sum += dot((float4)(1.0f), tmp[3]); + sum += dot((float4)(1.0f), tmp[4]); + sum += dot((float4)(1.0f), tmp[5]); + sum += dot((float4)(1.0f), tmp[6]); + sum += dot((float4)(1.0f), tmp[7]); tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; for (size_t i = tid; i < slices; i += 32) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); + float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); result = exp(result) * sum; - WRITE_IMAGE(output, (int2)(i, 0), result); + WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result)); } } __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, const int slices, const int slices_x32) { int tid = get_local_id(0); - FLT sum = 0.0f; + float sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); - sum += dot((FLT4)(1.0f), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); + sum += dot((float4)(1.0f), exp(src)); } if ((slices - 1) % 32 == tid) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); - sum += dot(TO_FLT4(mask), exp(src)); + sum += dot(convert_float4(mask), exp(src)); } - __local FLT4 tmp[8]; - __local FLT *tmpx1 = (__local FLT *)tmp; + __local float4 tmp[8]; + __local float *tmpx1 = (__local float *)tmp; tmpx1[tid] = sum; barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) { - sum = dot((FLT4)(1.0f), tmp[0]); - sum += dot((FLT4)(1.0f), tmp[1]); - sum += dot((FLT4)(1.0f), tmp[2]); - sum += dot((FLT4)(1.0f), tmp[3]); - sum += dot((FLT4)(1.0f), tmp[4]); - sum += dot((FLT4)(1.0f), tmp[5]); - sum += dot((FLT4)(1.0f), tmp[6]); - sum += dot((FLT4)(1.0f), tmp[7]); + sum = dot((float4)(1.0f), tmp[0]); + sum += dot((float4)(1.0f), tmp[1]); + sum += dot((float4)(1.0f), tmp[2]); + sum += dot((float4)(1.0f), tmp[3]); + sum += dot((float4)(1.0f), tmp[4]); + sum += dot((float4)(1.0f), tmp[5]); + sum += dot((float4)(1.0f), tmp[6]); + sum += dot((float4)(1.0f), tmp[7]); tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); + float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); result = exp(result) * sum; - output[i] = result; + output[i] = TO_FLT4(result); } if ((slices - 1) % 32 == tid) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); - result = exp(result) * sum; + float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); + result_float = exp(result_float) * sum; + FLT4 result = TO_FLT4(result_float); __global FLT4 *remain_ptr4 = output; remain_ptr4 += slices - 1; __global FLT *remain_ptr = (__global FLT *)remain_ptr4; @@ -269,37 +272,36 @@ __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 * __kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, const int slices, const int slices_x32) { int tid = get_local_id(0); - FLT sum = 0.0f; + float sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); - sum += dot((FLT4)(1.0f), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); + sum += dot((float4)(1.0f), exp(src)); } if ((slices - 1) % 32 == tid) { - FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); - - sum += dot(TO_FLT4(mask), exp(src)); + float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); + sum += dot(convert_float4(mask), exp(src)); } - __local FLT4 tmp[8]; - __local FLT *tmpx1 = (__local FLT *)tmp; + __local float4 tmp[8]; + __local float *tmpx1 = (__local float *)tmp; tmpx1[tid] = sum; barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) { - sum = dot((FLT4)(1.0f), tmp[0]); - sum += dot((FLT4)(1.0f), tmp[1]); - sum += dot((FLT4)(1.0f), tmp[2]); - sum += dot((FLT4)(1.0f), tmp[3]); - sum += dot((FLT4)(1.0f), tmp[4]); - sum += dot((FLT4)(1.0f), tmp[5]); - sum += dot((FLT4)(1.0f), tmp[6]); - sum += dot((FLT4)(1.0f), tmp[7]); + sum = dot((float4)(1.0f), tmp[0]); + sum += dot((float4)(1.0f), tmp[1]); + sum += dot((float4)(1.0f), tmp[2]); + sum += dot((float4)(1.0f), tmp[3]); + sum += dot((float4)(1.0f), tmp[4]); + sum += dot((float4)(1.0f), tmp[5]); + sum += dot((float4)(1.0f), tmp[6]); + sum += dot((float4)(1.0f), tmp[7]); tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; for (size_t i = tid; i < slices; i += 32) { - FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); + float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); result = exp(result) * sum; - WRITE_IMAGE(output, (int2)(0, i), result); + WRITE_IMAGE(output, (int2)(0, i), TO_FLT4(result)); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index c5d6857fcc..6d2e4ab367 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -129,7 +129,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!"; delete new_tensor; new_tensor = nullptr; - delete parameter; + free(parameter); parameter = nullptr; return RET_ERROR; } diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index 0391ac3e3c..1e8229421c 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -111,8 +111,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) ocl_runtime_->UnmapBuffer(*mem, host_ptr); if (!img_size.empty()) { cl::ImageFormat image_format(CL_RGBA, img_size[2]); - image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0], - img_size[1], img_pitch * dtype_size, &ret); + image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0], img_size[1], + img_pitch * dtype_size, &ret); if (image == nullptr || ret != CL_SUCCESS) { delete buffer; UnLock(); @@ -265,6 +265,9 @@ void OpenCLAllocator::Clear() { Lock(); auto svm_capabilities = ocl_runtime_->GetSVMCapabilities(); for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) { + if (it->second->map_flags) { + UnmapBuffer(it->second->host_ptr_); + } if (svm_capabilities) { clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_); MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_; diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index 6beaf92431..978bbf19e7 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -63,7 +63,7 @@ class OpenCLAllocator : public Allocator { int GetImageSize(void *host_ptr, std::vector *img_size); void *Prepare(void *ptr) override { if (ptr != nullptr) { - ptr = MapBuffer(ptr, CL_MAP_WRITE, nullptr, true); + ptr = MapBuffer(ptr, CL_MAP_READ | CL_MAP_WRITE, nullptr, true); } return ptr; } diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h index 0fa96d3c03..0296503e02 100644 --- a/mindspore/lite/src/tensor.h +++ b/mindspore/lite/src/tensor.h @@ -184,6 +184,7 @@ class Tensor : public mindspore::tensor::MSTensor { MS_LOG(WARNING) << "Malloc data failed"; } } + Prepare(); return this->data_; }