!6088 optimize opencl convolution fp16 kernel

Merge pull request !6088 from 王东旭/opencl_convolution_support_fp16
pull/6088/MERGE
mindspore-ci-bot committed by Gitee
commit 37561b1b4c

@@ -310,3 +310,13 @@ __kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data,
}
dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
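
For context, a minimal host-side launch sketch for the new *_half kernel above, written against the stock OpenCL C++ wrapper (cl2.hpp). The helper name and the program/queue/image/buffer variables are assumptions for illustration; MindSpore Lite drives this through its own OpenCL runtime wrapper instead.

#include <CL/cl2.hpp>

// Hypothetical launcher: one work-item per half4 vector, so the global
// size is simply {size.x, size.y, size.z}, matching the bounds check in
// the kernel. Each work-item reads the image at (Y * size.z + Z, X) and
// stores to the linear buffer index (X * size.y + Y) * size.z + Z.
void ToFormatNHWC4Half(const cl::Program &program, cl::CommandQueue *queue,
                       const cl::Image2D &img, const cl::Buffer &buf,
                       cl_int4 size, cl_int4 shape) {
  cl::Kernel kernel(program, "to_format_NHWC4_to_NHWC4_BUF_half");
  kernel.setArg(0, img);
  kernel.setArg(1, buf);
  kernel.setArg(2, size);
  kernel.setArg(3, shape);
  queue->enqueueNDRangeKernel(kernel, cl::NullRange,
                              cl::NDRange(size.s[0], size.s[1], size.s[2]),
                              cl::NullRange);
}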

@@ -42,27 +42,35 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
private:
bool use_fp16_ = false;
int CI;
int IH;
int IW;
int CO;
int OH;
int OW;
int CI_SLICES;
int CO_SLICES;
int CI_{};
int IH_{};
int IW_{};
int CO_{};
int OH_{};
int OW_{};
int CI_SLICES_{};
int CO_SLICES_{};
int KH_{};
int KW_{};
void *packed_weight_ = nullptr;
void *packed_bias_ = nullptr;
bool use_winograd_ = false;
int TILES_X;
int TILES_Y;
int TILES_XY;
int TILES_X_{};
int TILES_Y_{};
int TILES_XY_{};
void *winograd_mem0_ = nullptr;
void *winograd_mem1_ = nullptr;
cl::Kernel kernel_4x4to36;
cl::Kernel kernel_conv;
cl::Kernel kernel_36to4x4;
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_;
int InitWeight();
int InitBias();
int RearrangeWinogradWeight();
template <typename SRC_T, typename DST_T>
int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
std::string CodeGenConvolutionNHWC4();
std::string CodeGenConvolutionNC4HW4();
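
The new OHWI2OHWIOGroupI4O4 template above repacks the OHWI weight tensor into OGroup-sized blocks of CO slices, each slice holding 4 input lanes by 4 output lanes, so the conv kernel can load whole float4/half4 vectors. Below is a hedged sketch of one plausible packing order, read off the layout name; the committed loop order may differ, and CO/CI are passed explicitly here instead of being class members.

#include <cstddef>
#include <vector>

template <typename SRC_T, typename DST_T>
std::vector<DST_T> PackOHWIToOGroupI4O4(const SRC_T *src, size_t CO, size_t KH,
                                        size_t KW, size_t CI, size_t OGroup) {
  auto up_div = [](size_t x, size_t y) { return (x + y - 1) / y; };
  size_t co_slices = up_div(CO, 4);
  size_t ci_slices = up_div(CI, 4);
  size_t groups = up_div(co_slices, OGroup);
  // Zero-padded so partial channel slices read as 0 in the kernel.
  std::vector<DST_T> dst(groups * KH * KW * ci_slices * OGroup * 4 * 4, DST_T(0));
  size_t n = 0;
  for (size_t g = 0; g < groups; ++g)
    for (size_t kh = 0; kh < KH; ++kh)
      for (size_t kw = 0; kw < KW; ++kw)
        for (size_t ci_s = 0; ci_s < ci_slices; ++ci_s)
          for (size_t og = 0; og < OGroup; ++og)
            for (size_t i4 = 0; i4 < 4; ++i4)
              for (size_t o4 = 0; o4 < 4; ++o4, ++n) {
                size_t co = (g * OGroup + og) * 4 + o4;
                size_t ci = ci_s * 4 + i4;
                if (co < CO && ci < CI) {
                  // Source OHWI index: ((co*KH + kh)*KW + kw)*CI + ci.
                  dst[n] = static_cast<DST_T>(src[((co * KH + kh) * KW + kw) * CI + ci]);
                }
              }
  return dst;
}
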
@@ -72,16 +80,18 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
std::string CodeGenWinograd36To4x4();
int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
bool UseWinograd4x4To6x6() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
const bool hw_good = TILES_X * TILES_Y >= 16;
const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good;
}
std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
std::vector<float> C(M * K);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {

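The kernel names kernel_4x4to36_ and kernel_36to4x4_ indicate Winograd F(4x4, 3x3): each 6x6 input tile (36 values) yields a 4x4 output tile. Assuming TILES_X_ and TILES_Y_ count those 4x4 output tiles, here is a worked evaluation of UseWinograd4x4To6x6() for a hypothetical 56x56x64 -> 56x56x64, 3x3/stride-1 layer:

#include <cstdio>

int main() {
  auto up_div = [](int x, int y) { return (x + y - 1) / y; };
  int ci_slices = up_div(64, 4);  // 16 >= 12
  int co_slices = up_div(64, 4);  // 16 >= 12
  int tiles_x = up_div(56, 4);    // 14
  int tiles_y = up_div(56, 4);    // 14
  bool channel_good = ci_slices >= 12 && co_slices >= 12;  // true
  bool hw_good = tiles_x * tiles_y >= 16;                  // 196 >= 16, true
  printf("use winograd: %d\n", channel_good && hw_good);   // prints 1
  return 0;
}

The MatrixMultiply hunk above is truncated; a hedged completion of the plain row-major M x N by N x K multiply that its new signature implies:

#include <vector>

static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
  std::vector<float> C(M * K);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < K; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < N; ++k) {
        acc += A[i * N + k] * B[k * K + j];  // C[i][j] += A[i][k] * B[k][j]
      }
      C[i * K + j] = acc;
    }
  }
  return C;
}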
@@ -28,3 +28,6 @@ cp -fr $TEST_DATA_DIR/testPK ./data
./lite-test --gtest_filter=TestDeconvInt8.*
./lite-test --gtest_filter="TestTfliteParser*"
# for GPU OpenCL
./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*"

@@ -21,19 +21,18 @@
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
#include "nnacl/pack.h"
#include "nnacl/fp32/common_func.h"
using mindspore::kernel::ConvolutionOpenCLKernel;
using mindspore::kernel::LiteKernel;
using mindspore::kernel::SubGraphOpenCLKernel;
using mindspore::lite::Tensor;
using mindspore::schema::Format;
using mindspore::schema::Format_KHWC;
using mindspore::schema::Format_NC4HW4;
using mindspore::schema::Format_NCHW;
using mindspore::schema::Format_NHWC;
using mindspore::schema::Format_NHWC4;
using mindspore::schema::NodeType_ValueNode;
using mindspore::schema::Format::Format_KHWC;
using mindspore::schema::Format::Format_NC4HW4;
using mindspore::schema::Format::Format_NCHW;
using mindspore::schema::Format::Format_NHWC;
using mindspore::schema::Format::Format_NHWC4;
namespace mindspore {
@@ -41,26 +40,25 @@ class TestConvolutionOpenCL : public mindspore::CommonTest {};
void LoadData(Tensor *tensor, const float *src) {
if (tensor->data_type() == kNumberTypeFloat16) {
auto num = tensor->Size() / 2;
auto tensor_data = reinterpret_cast<uint16_t *>(tensor->MutableData());
auto num = tensor->Size() / sizeof(float16_t);
auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
for (int i = 0; i < num; ++i) {
tensor_data[i] = Float32ToShort(src[i]);
tensor_data[i] = static_cast<float16_t>(src[i]);
}
} else {
memcpy(tensor->MutableData(), src, tensor->Size());
memcpy(tensor->data_c(), src, tensor->Size());
}
}
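
The fp16 path now relies on the compiler's native float16_t conversion instead of the bit-level helper Float32ToShort (hence the dropped nnacl/fp32/common_func.h include above). A standalone round-trip sketch, assuming an ARM target where float16_t comes from arm_neon.h:

#include <arm_neon.h>  // provides float16_t on ARM targets (assumption)
#include <cstdio>

int main() {
  float src = 3.14159f;
  float16_t h = static_cast<float16_t>(src);  // hardware fp16 conversion
  float back = static_cast<float>(h);
  printf("%f -> %f\n", src, back);  // ~3.140625: fp16 keeps only 10 mantissa bits
  return 0;
}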
void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);
std::vector<float> output_data(num);
if (output->data_type() == kNumberTypeFloat16) {
auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->MutableData());
for (int i = 0; i < output_data.size(); ++i) {
output_data[i] = ShortToFloat32((output_data_fp16[i]));
output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);
}
} else {
memcpy(output_data.data(), output->MutableData(), output->Size());
memcpy(output_data.data(), output->data_c(), output->Size());
}
printf("output:");
@@ -69,9 +67,9 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
}
printf("\n");
float max_err = 0.0f;
float max_err = -1.0f;
std::array<int, 5> idx_5d{};
int idx = -1;
int max_err_idx = -1, first_err_idx = -1;
auto SLICES = UP_DIV(output->Channel(), 4);
int I = 1, J = 1, K = 1, L = 1, M = 1;
switch (output->GetFormat()) {
@@ -98,10 +96,13 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
for (int l = 0; l < L; ++l) {
for (int m = 0; m < M; ++m) {
auto err = std::fabs(output_data[cn] - expect_data[cn]);
if (first_err_idx == -1 && err > atol) {
first_err_idx = cn;
}
if (err > max_err) {
max_err = err;
idx_5d = {i, j, k, l, m};
idx = cn;
max_err_idx = cn;
}
cn++;
}
@@ -110,18 +111,19 @@
}
}
float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
} else {
printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
}
printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
max_err, relative_err * 100);
if (max_err > atol) {
printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
output_data[first_err_idx]);
FAIL();
} else {
float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
} else {
printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
}
printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
output_data[max_err_idx], max_err, relative_err * 100);
printf("COMPARE SUCCESS!\n\n");
}
}
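
For context, a hypothetical call site for CompareOutput inside a test body; the tolerance values here are illustrative, not the ones TestConvolutionOpenCL.simple_test actually passes. Since fp16 accumulates more rounding error than fp32, the fp16 path warrants a looser absolute tolerance:

// Fragment only: output_tensor and expect_data come from the test fixture.
bool is_fp16 = output_tensor->data_type() == kNumberTypeFloat16;
CompareOutput(output_tensor, expect_data, is_fp16 ? 1e-2f : 1e-5f);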
