From 60a146f173e4dbf51deab3b52d28f979715ef061 Mon Sep 17 00:00:00 2001
From: Corleone
Date: Fri, 11 Sep 2020 16:01:38 +0800
Subject: [PATCH] add FP16 support for arithmetic and scale for GPU

---
 .../src/runtime/kernel/opencl/cl/scale.cl     |  14 +-
 .../kernel/opencl/kernel/arithmetic.cc        | 128 +++++++++++++++---
 .../runtime/kernel/opencl/arithmetic_tests.cc |  91 ++++++++-----
 .../src/runtime/kernel/opencl/scale_tests.cc  |   9 +-
 4 files changed, 176 insertions(+), 66 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
index 92dc04d45b..d4a8d5ec5e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
@@ -9,9 +9,9 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale
     return;
   }
 
-  FLT4 in = read_imagef(input, smp_none, (int2)(X, Y));
-  FLT4 s = read_imagef(scale, smp_none, (int2)(X, Y));
-  FLT4 o = read_imagef(offset, smp_none, (int2)(X, Y));
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
+  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
+  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
   WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
 }
 
@@ -23,7 +23,7 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float
     return;
   }
 
-  FLT4 in = read_imagef(input, smp_none, (int2)(X, Y));
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
   WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset);
 }
 
@@ -35,8 +35,8 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca
     return;
   }
 
-  FLT4 in = read_imagef(input, smp_none, (int2)(X, Y));
-  FLT4 s = read_imagef(scale, smp_none, (int2)(X % C, 0));
-  FLT4 o = read_imagef(offset, smp_none, (int2)(X % C, 0));
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
+  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
+  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
   WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index b0298b3039..9700ddc3d8 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -18,6 +18,7 @@
 #include <set>
 #include <string>
 #include <vector>
+#include "nnacl/fp32/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
@@ -48,16 +49,20 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 
 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
   local_size_ = {16, 16};
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
+  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
+    size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+    size_t W = out_tensors_[0]->Width();
+    global_size_ = {W, H};
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
     size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
     size_t W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
     global_size_ = {W, H};
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
     size_t H = out_tensors_[0]->Batch();
     size_t W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
     global_size_ = {W, H};
   } else {
-    MS_LOG(ERROR) << "Unspport data format " << out_tensors_[0]->GetFormat();
+    MS_LOG(ERROR) << "Unsupported data format " << out_tensors_[0]->GetFormat();
   }
 }
 
@@ -68,21 +73,29 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
 
 int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
+  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
+    im_dst_x = out_tensors_[0]->Width();
+    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
     im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
     im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
     im_dst_y = out_tensors_[0]->Batch();
     im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
   } else {
-    MS_LOG(ERROR) << "Unspport data format " << out_tensors_[0]->GetFormat();
+    MS_LOG(ERROR) << "Unsupported data format " << out_tensors_[0]->GetFormat();
     return RET_ERROR;
   }
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
+
   size_t img_dtype = CL_FLOAT;
-#endif
+  if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
+    img_dtype = CL_HALF_FLOAT;
+  } else if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    img_dtype = CL_FLOAT;
+  } else {
+    MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
+    return RET_ERROR;
+  }
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
@@ -93,23 +105,99 @@ int ArithmeticOpenCLKernel::InitBuffer() {
   const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
   if (!arithmetic_parameter->broadcasting_) {
     if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->MutableData() != nullptr) {
-      auto allocatdor = runtime_->GetAllocator();
+      auto allocator = runtime_->GetAllocator();
       std::vector<size_t> img_size;
       GetImageSize(0, &img_size);
-      weight_ptr_ =
-        allocatdor->CreateImageFromHost(in_tensors_[1]->MutableData(), in_tensors_[1]->ElementsNum(), img_size);
-      return RET_OK;
+      int pack_weight_size = in_tensors_[1]->ElementsC4Num();
+      int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width();
+      int channel = in_tensors_[1]->Channel();
+      int batch = in_tensors_[1]->Batch();
+
+      if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) {
+        if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) {
+          weight_ptr_ =
+            allocator->CreateImageFromHost(in_tensors_[1]->MutableData(), in_tensors_[1]->ElementsNum(), img_size);
+        } else {
+          MS_LOG(ERROR) << "Unsupported data type transpose from " << in_tensors_[1]->data_type() << " to "
+                        << in_tensors_[0]->data_type();
+          return RET_ERROR;
+        }
+      } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
+        if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
+          if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+            float *weight = new (std::nothrow) float[pack_weight_size];
+            if (weight == nullptr) {
+              MS_LOG(ERROR) << "Malloc buffer failed!";
+              return RET_ERROR;
+            }
+            std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+            PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
+            delete[] weight;
+          } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
+            int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+            if (weight == nullptr) {
+              MS_LOG(ERROR) << "Malloc buffer failed!";
+              return RET_ERROR;
+            }
+            std::function<int16_t(float)> to_dtype = Float32ToShort;
+            PackNHWCToNC4HW4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
+            delete[] weight;
+          } else {
+            MS_LOG(ERROR) << "Unsupported data type transpose from " << in_tensors_[1]->data_type() << " to "
+                          << in_tensors_[0]->data_type();
+            return RET_ERROR;
+          }
+        } else {
+          MS_LOG(ERROR) << "Unsupported format transpose from " << in_tensors_[1]->GetFormat() << " to "
+                        << in_tensors_[0]->GetFormat();
+          return RET_ERROR;
+        }
+      } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
+        if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
+          if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+            float *weight = new (std::nothrow) float[pack_weight_size];
+            if (weight == nullptr) {
+              MS_LOG(ERROR) << "Malloc buffer failed!";
+              return RET_ERROR;
+            }
+            std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+            PackNHWCToNHWC4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
+            delete[] weight;
+          } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
+            int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+            if (weight == nullptr) {
+              MS_LOG(ERROR) << "Malloc buffer failed!";
+              return RET_ERROR;
+            }
+            std::function<int16_t(float)> to_dtype = Float32ToShort;
+            PackNHWCToNHWC4(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
+            delete[] weight;
+          } else {
+            MS_LOG(ERROR) << "Unsupported data type transpose from " << in_tensors_[1]->data_type() << " to "
+                          << in_tensors_[0]->data_type();
+            return RET_ERROR;
+          }
+        } else {
+          MS_LOG(ERROR) << "Unsupported format transpose from " << in_tensors_[1]->GetFormat() << " to "
+                        << in_tensors_[0]->GetFormat();
+          return RET_ERROR;
+        }
+      }
     }
   }
   return RET_OK;
 }
+
 int ArithmeticOpenCLKernel::Init() {
   runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
   std::string kernel_name;
   const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
-  if (arithmetic_parameter->broadcasting_ && in_tensors_[1]->category() == lite::Tensor::Category::CONST &&
-      in_tensors_[1]->MutableData() != nullptr) {
+  if (arithmetic_parameter->broadcasting_) {
     element_flag_ = false;
     kernel_name = "BoardcastArith";
   } else {
@@ -202,10 +290,13 @@ int ArithmeticOpenCLKernel::Run() {
 
   int H = 0;
   int W = 0;
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
+  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
+    H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+    W = out_tensors_[0]->Width();
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
     H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
     W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
+  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
     H = out_tensors_[0]->Batch();
     W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
   } else {
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
index 446a114d5d..3222cfe5d6 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -20,19 +20,22 @@
 namespace mindspore {
-void BoardcaseAdd(const float *a, const float b, float *c, const int size) {
+template <typename T>
+static void BoardcaseAdd(const T *a, const T b, T *c, const int size) {
   for (int i = 0; i < size; i++) {
     c[i] = a[i] + b;
   }
 }
 
-void ElementAdd(const float *a, const float *b, float *c, const int size) {
+template <typename T>
+static void ElementAdd(const T *a, const T *b, T *c, const int size) {
   for (int i = 0; i < size; i++) {
     c[i] = a[i] + b[i];
   }
 }
 
-bool DataCompare(const float *a, const float *b, const int size, const float accuracy = 1e-4) {
+template <typename T>
+static bool DataCompare(const T *a, const T *b, const int size, const float accuracy = 1e-4) {
   for (int i = 0; i < size; i++) {
     auto diff = fabs(a[i] - b[i]);
     if (diff > accuracy) {
@@ -43,36 +46,40 @@ bool DataCompare(const float *a, const float *b, const int size, const float accuracy = 1e-4) {
   return true;
 }
 
-void InitData(void *data, const int size) {
-  float *data_float = reinterpret_cast<float *>(data);
+template <typename T>
+static void InitData(void *data, const int size) {
+  T *data_float = reinterpret_cast<T *>(data);
   static unsigned int seed = 123;
   for (int i = 0; i < size; i++) {
     data_float[i] = static_cast<int>(rand_r(&seed)) % 100;
   }
 }
 
-void LogData(void *data, const int size, const std::string prefix) {
+template <typename T>
+static void LogData(void *data, const int size, const std::string prefix) {
   std::cout << prefix;
-  float *data_float = reinterpret_cast<float *>(data);
+  T *data_float = reinterpret_cast<T *>(data);
   for (int i = 0; i < size; i++) {
     std::cout << data_float[i] << ",";
   }
   std::cout << std::endl;
 }
 
-void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
+template <typename T>
+static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
 
   bool is_bias_add = shape_b.empty();
-  auto tensorType = lite::TensorCategory(schema::NodeType_ValueNode);
-
-  lite::Tensor *tensor_a =
-    new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
-  lite::Tensor *tensor_b =
-    new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape_b, schema::Format_NHWC4, tensorType);
-  lite::Tensor *tensor_c =
-    new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
+  auto data_type = kNumberTypeFloat32;
+  if (sizeof(T) == 2) {
+    data_type = kNumberTypeFloat16;
+    ocl_runtime->SetFp16Enable(true);
+  }
+
+  lite::Tensor *tensor_a = new (std::nothrow) lite::Tensor(data_type, shape_a, schema::Format_NHWC4);
+  lite::Tensor *tensor_b = new (std::nothrow) lite::Tensor(data_type, shape_b, schema::Format_NHWC4);
+  lite::Tensor *tensor_c = new (std::nothrow) lite::Tensor(data_type, shape_a, schema::Format_NHWC4);
   if (tensor_a == nullptr || tensor_b == nullptr || tensor_c == nullptr) {
     MS_LOG(ERROR) << "Create tensor failed!";
     delete tensor_a;
@@ -84,10 +91,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   int64_t element_num = tensor_a->ElementsC4Num();
   int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();
-  float *data_a = new (std::nothrow) float[element_num];
-  float *data_b = new (std::nothrow) float[element_num_b];
-  float *data_c_cpu = new (std::nothrow) float[element_num];
-  float *data_c_ocl = new (std::nothrow) float[element_num];
+  T *data_a = new (std::nothrow) T[element_num];
+  T *data_b = new (std::nothrow) T[element_num_b];
+  T *data_c_cpu = new (std::nothrow) T[element_num];
+  T *data_c_ocl = new (std::nothrow) T[element_num];
   if (data_a == nullptr || data_b == nullptr || data_c_cpu == nullptr || data_c_ocl == nullptr) {
     MS_LOG(ERROR) << "Create buffer failed!";
     delete tensor_a;
@@ -100,12 +107,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     return;
   }
 
-  InitData(data_a, element_num);
-  InitData(data_b, element_num_b);
-  memset(data_c_ocl, 0, sizeof(float) * element_num);
+  InitData<T>(data_a, element_num);
+  InitData<T>(data_b, element_num_b);
+  memset(data_c_ocl, 0, sizeof(T) * element_num);
 
   if (is_bias_add) {
-    BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
+    BoardcaseAdd(data_a, static_cast<T *>(data_b)[0], data_c_cpu, element_num);
   } else {
     ElementAdd(data_a, data_b, data_c_cpu, element_num);
   }
@@ -115,11 +122,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     inputs.push_back(tensor_b);
   } else {
     tensor_b->MallocData();
-    memcpy(tensor_b->MutableData(), data_b, sizeof(float));
+    memcpy(tensor_b->MutableData(), data_b, sizeof(T));
   }
   std::vector<lite::Tensor *> outputs = {tensor_c};
 
   ArithmeticParameter *param = new (std::nothrow) ArithmeticParameter();
+  param->broadcasting_ = is_bias_add;
   if (param == nullptr) {
     MS_LOG(ERROR) << "Create parameter failed!";
     delete tensor_a;
@@ -170,19 +178,19 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   }
   kernel->Init();
 
-  memcpy(inputs[0]->MutableData(), data_a, sizeof(float) * element_num);
+  memcpy(inputs[0]->MutableData(), data_a, sizeof(T) * element_num);
   if (!is_bias_add) {
-    memcpy(inputs[1]->MutableData(), data_b, sizeof(float) * element_num_b);
+    memcpy(inputs[1]->MutableData(), data_b, sizeof(T) * element_num_b);
   }
 
   kernel->Run();
 
-  memcpy(data_c_ocl, outputs[0]->MutableData(), sizeof(float) * element_num);
+  memcpy(data_c_ocl, outputs[0]->MutableData(), sizeof(T) * element_num);
 
-  LogData(data_a, 10, "Data A : ");
-  LogData(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : ");
-  LogData(data_c_cpu, 10, "Expect compute : ");
-  LogData(outputs[0]->MutableData(), 10, "OpenCL compute : ");
+  LogData<T>(data_a, 10, "Data A : ");
+  LogData<T>(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : ");
+  LogData<T>(data_c_cpu, 10, "Expect compute : ");
+  LogData<T>(outputs[0]->MutableData(), 10, "OpenCL compute : ");
   bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
   MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
   EXPECT_EQ(true, cmp);
@@ -210,16 +218,27 @@ class TestArithmeticOpenCL : public mindspore::CommonTest {
   TestArithmeticOpenCL() {}
 };
 
-TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
+TEST_F(TestArithmeticOpenCL, AddElementwiseFP32) {
   const std::vector<int> &shape_a = {1, 1024, 1024, 4};
   const std::vector<int> &shape_b = {1, 1024, 1024, 4};
-  TestCase(shape_a, shape_b);
+  TestCase<float>(shape_a, shape_b);
 }
 
-TEST_F(TestArithmeticOpenCL, AddBroadcastTest) {
+TEST_F(TestArithmeticOpenCL, AddBroadcastFP32) {
   const std::vector<int> &shape_a = {1, 128, 128, 4};
   const std::vector<int> &shape_b = {};
-  TestCase(shape_a, shape_b);
+  TestCase<float>(shape_a, shape_b);
 }
 
+TEST_F(TestArithmeticOpenCL, AddElementwiseFP16) {
+  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
+  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
+  TestCase<float16_t>(shape_a, shape_b);
+}
+
+TEST_F(TestArithmeticOpenCL, AddBroadcastFP16) {
+  const std::vector<int> &shape_a = {1, 128, 128, 4};
+  const std::vector<int> &shape_b = {};
+  TestCase<float16_t>(shape_a, shape_b);
+}
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
index bbd8433039..bb3816067a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
@@ -71,7 +71,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
   auto allocator = ocl_runtime->GetAllocator();
 
   bool is_broadcast = shape_b.empty();
-  auto tensorType = lite::TensorCategory(schema::NodeType_ValueNode);
   auto format = schema::Format_NHWC4;
 
   auto data_type = kNumberTypeFloat32;
@@ -79,10 +78,10 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
     data_type = kNumberTypeFloat16;
     ocl_runtime->SetFp16Enable(true);
   }
-  lite::Tensor *tensor_in = new (std::nothrow) lite::Tensor(data_type, shape_a, format, tensorType);
-  lite::Tensor *tensor_scale = new (std::nothrow) lite::Tensor(data_type, shape_b, format, tensorType);
-  lite::Tensor *tensor_offset = new (std::nothrow) lite::Tensor(data_type, shape_b, format, tensorType);
-  lite::Tensor *tensor_out = new (std::nothrow) lite::Tensor(data_type, shape_a, format, tensorType);
+  lite::Tensor *tensor_in = new (std::nothrow) lite::Tensor(data_type, shape_a, format);
+  lite::Tensor *tensor_scale = new (std::nothrow) lite::Tensor(data_type, shape_b, format);
+  lite::Tensor *tensor_offset = new (std::nothrow) lite::Tensor(data_type, shape_b, format);
+  lite::Tensor *tensor_out = new (std::nothrow) lite::Tensor(data_type, shape_a, format);
   if (tensor_in == nullptr || tensor_scale == nullptr || tensor_offset == nullptr) {
     MS_LOG(ERROR) << "Create tensor failed!";
     delete tensor_in;