diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index f02024a1c6..7f79bcc243 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -54,7 +54,307 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b)); } -__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output, +__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & as_int4(b))); +} + +__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | as_int4(b))); +} + +__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), max(a, b)); +} + +__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), min(a, b)); +} + +__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), floor(a / b)); +} + +__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b); +} + +__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2)); +} + +__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b)); +} + +__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b)); +} + +__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b)); +} + +__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b)); +} + +__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b)); +} + +__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b)); +} + +__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b); +} + +__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b); +} + +__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b); +} + +__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b)); +} +__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & (int4)(b))); +} + +__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | (int4)b)); +} + +__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b)); +} + +__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b)); +} + +__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b)); +} + +__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b); +} + +__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2)); +} + +__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, const int2 output_shape) { int X = get_global_id(0); int Y = get_global_id(1); @@ -63,7 +363,67 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl } FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); - WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b)); +} + +__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b)); +} + +__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b)); +} + +__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b)); +} + +__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b)); +} + +__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + + FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); + WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b)); } __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, @@ -94,9 +454,26 @@ __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, _ output[idx] = input_a[idx] * input_b[idx]; } -__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output, - const unsigned int n) { +__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] + (FLT)b; +} + +__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] - (FLT)b; +} + +__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * (FLT)b; +} + +__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { int idx = get_global_id(0); if (idx >= n) return; - output[idx] = weight * input_a[idx] + bias; + output[idx] = divide_no_check(input_a[idx], (FLT)b); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 9700ddc3d8..826d9ddc2c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -28,6 +28,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; +using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { @@ -130,18 +131,18 @@ int ArithmeticOpenCLKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - std::function to_dtype = [](float x) -> float { return (float)x; }; + std::function to_dtype = [](float x) -> float { return x; }; PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); delete[] weight; } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - int16_t *weight = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; if (weight == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - std::function to_dtype = Float32ToShort; - PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); delete[] weight; } else { @@ -162,18 +163,18 @@ int ArithmeticOpenCLKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - std::function to_dtype = [](float x) -> float { return (float)x; }; + std::function to_dtype = [](float x) -> float { return x; }; PackNHWCToNHWC4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); delete[] weight; } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - int16_t *weight = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; if (weight == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - std::function to_dtype = Float32ToShort; - PackNHWCToNHWC4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNHWC4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype); weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size); delete[] weight; } else { @@ -197,28 +198,69 @@ int ArithmeticOpenCLKernel::Init() { std::string kernel_name; const ArithmeticParameter *arithmetic_parameter = reinterpret_cast(op_parameter_); + if (arithmetic_parameter->broadcasting_) { element_flag_ = false; - kernel_name = "BoardcastArith"; + kernel_name = "Broadcast"; } else { - element_flag_ = true; - switch (op_parameter_->type_) { - case PrimitiveType_Mul: - kernel_name = "ElementMul"; - break; - case PrimitiveType_Add: - kernel_name = "ElementAdd"; - break; - case PrimitiveType_Sub: - kernel_name = "ElementSub"; - break; - case PrimitiveType_Div: - kernel_name = "ElementDiv"; - break; - default: - MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; - break; - } + kernel_name = "Element"; + } + + switch (op_parameter_->type_) { + case PrimitiveType_Mul: + kernel_name += "Mul"; + break; + case PrimitiveType_Add: + kernel_name += "Add"; + break; + case PrimitiveType_Sub: + kernel_name += "Sub"; + break; + case PrimitiveType_Div: + kernel_name += "Div"; + break; + case PrimitiveType_LogicalAnd: + kernel_name += "And"; + break; + case PrimitiveType_LogicalOr: + kernel_name += "Or"; + break; + case PrimitiveType_Maximum: + kernel_name += "Max"; + break; + case PrimitiveType_Minimum: + kernel_name += "Min"; + break; + case PrimitiveType_FloorDiv: + kernel_name += "FloorDiv"; + break; + case PrimitiveType_FloorMod: + kernel_name += "FloorMod"; + break; + case PrimitiveType_SquaredDifference: + kernel_name += "SquaredDifference"; + break; + case PrimitiveType_Equal: + kernel_name += "Equal"; + break; + case PrimitiveType_NotEqual: + kernel_name += "NotEqual"; + break; + case PrimitiveType_Less: + kernel_name += "Less"; + break; + case PrimitiveType_LessEqual: + kernel_name += "LessEqual"; + break; + case PrimitiveType_Greater: + kernel_name += "Greater"; + break; + case PrimitiveType_GreaterEqual: + kernel_name += "GreaterEqual"; + break; + default: + MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; + return RET_ERROR; } lite::STATUS error_code = RET_OK; @@ -265,26 +307,8 @@ int ArithmeticOpenCLKernel::Run() { void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->MutableData() : weight_ptr_; runtime_->SetKernelArg(kernel_, arg_idx++, weight); } else { - float value = static_cast(in_tensors_[1]->MutableData())[0]; - switch (op_parameter_->type_) { - case PrimitiveType_Mul: - weight_ = value; - break; - case PrimitiveType_Add: - bias_ = value; - break; - case PrimitiveType_Sub: - bias_ = -1 * value; - break; - case PrimitiveType_Div: - weight_ = 1 / value; - break; - default: - MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; - break; - } - runtime_->SetKernelArg(kernel_, arg_idx++, weight_); - runtime_->SetKernelArg(kernel_, arg_idx++, bias_); + float weight = static_cast(in_tensors_[1]->MutableData())[0]; + runtime_->SetKernelArg(kernel_, arg_idx++, weight); } runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData()); @@ -345,4 +369,36 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCr REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator) REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator) REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 0555212e32..1d8fd987d7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -44,8 +44,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; lite::opencl::OpenCLRuntime *runtime_; bool element_flag_{true}; - float weight_{1.f}; - float bias_{.0f}; void *weight_ptr_{nullptr}; std::vector local_size_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index 0f4945d352..bd8724871c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -152,7 +152,7 @@ int ScaleOpenCLKernel::InitBuffer() { delete[] scale; return RET_ERROR; } - std::function to_dtype = [](float x) -> float { return (float)x; }; + std::function to_dtype = [](float x) -> float { return x; }; PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); PackNHWCToNC4HW4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); @@ -160,20 +160,20 @@ int ScaleOpenCLKernel::InitBuffer() { delete[] scale; delete[] offset; } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - int16_t *scale = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *scale = new (std::nothrow) float16_t[pack_weight_size]; if (scale == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - int16_t *offset = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *offset = new (std::nothrow) float16_t[pack_weight_size]; if (offset == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; delete[] scale; return RET_ERROR; } - std::function to_dtype = Float32ToShort; - PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); - PackNHWCToNC4HW4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); + PackNHWCToNC4HW4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); delete[] scale; @@ -202,7 +202,7 @@ int ScaleOpenCLKernel::InitBuffer() { delete[] scale; return RET_ERROR; } - std::function to_dtype = [](float x) -> float { return (float)x; }; + std::function to_dtype = [](float x) -> float { return x; }; PackNHWCToNHWC4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); PackNHWCToNHWC4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); @@ -210,20 +210,20 @@ int ScaleOpenCLKernel::InitBuffer() { delete[] scale; delete[] offset; } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { - int16_t *scale = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *scale = new (std::nothrow) float16_t[pack_weight_size]; if (scale == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; return RET_ERROR; } - int16_t *offset = new (std::nothrow) int16_t[pack_weight_size]; + float16_t *offset = new (std::nothrow) float16_t[pack_weight_size]; if (offset == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed!"; delete[] scale; return RET_ERROR; } - std::function to_dtype = Float32ToShort; - PackNHWCToNHWC4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); - PackNHWCToNHWC4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNHWC4(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype); + PackNHWCToNHWC4(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype); scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); delete[] scale; @@ -328,8 +328,8 @@ int ScaleOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { - int16_t scale = static_cast(in_tensors_[1]->MutableData())[0]; - int16_t offset = static_cast(in_tensors_[2]->MutableData())[0]; + float16_t scale = static_cast(in_tensors_[1]->MutableData())[0]; + float16_t offset = static_cast(in_tensors_[2]->MutableData())[0]; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); } else { diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 3ebce581fa..2f13dad701 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -300,12 +300,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na if (fp16_enable_) { // fp16 enable, kernel will use half and read_imageh and write_imageh. build_options_str = - "-DFLT=half -DFLT4=half4 -DFLT16=half16 " + "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 " "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 "; } else { // fp16 not enable, kernel will use float and read_imagef and write_imagef. build_options_str = - "-DFLT=float -DFLT4=float4 -DFLT16=float16 " + "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 " "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 "; }