transpose support NCHW2NHWC

fix: GPU init failed, disable winograd
pull/6394/head
chenzupeng 4 years ago
parent 2db8560a14
commit 1f89ea759b

@@ -346,8 +346,12 @@ int LiteSession::Init(Context *context) {
   if (context_->device_type_ == DT_GPU) {
     auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
     opencl_runtime->SetFp16Enable(context_->float16_priority);
-    opencl_runtime->Init();
-    MS_LOG(INFO) << "Init OpenCL runtime.";
+    if (opencl_runtime->Init() != RET_OK) {
+      context_->device_type_ = DT_CPU;
+      MS_LOG(WARNING) << "Init OpenCL runtime failed, change to CPU mode.";
+    } else {
+      MS_LOG(INFO) << "Init OpenCL runtime success.";
+    }
   }
 #endif
   executor = new Executor();

@@ -97,6 +97,7 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
       break;
     case PrimitiveType_Round:
       kernel_name[0] += "_ElementRound";
+      break;
     case PrimitiveType_Neg:
       kernel_name[0] += "_ElementNeg";
       break;
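The one-line fix above adds the `break` that was missing after the Round case; without it, control fell through into the Neg case and both suffixes were appended to the kernel name. A minimal standalone sketch of the effect (simplified names, not the actual enum or prefix from the source):

```cpp
#include <iostream>
#include <string>

enum Op { Round, Neg };  // stand-ins for PrimitiveType_Round / PrimitiveType_Neg

std::string GetKernelName(Op op) {
  std::string name = "ArithmeticSelf";  // hypothetical prefix for illustration
  switch (op) {
    case Round:
      name += "_ElementRound";
      // BUG reproduced on purpose: without a break here, execution
      // falls through and also appends the Neg suffix.
    case Neg:
      name += "_ElementNeg";
      break;
  }
  return name;
}

int main() {
  // Prints "ArithmeticSelf_ElementRound_ElementNeg": the Round kernel name is
  // corrupted, so the wrong (or no) OpenCL kernel gets looked up at build time.
  std::cout << GetKernelName(Round) << std::endl;
}
```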

@@ -68,7 +68,7 @@ int ConvolutionOpenCLKernel::Init() {
   TILES_X_ = UP_DIV(OW_, 4);
   TILES_Y_ = UP_DIV(OH_, 4);
   TILES_XY_ = TILES_X_ * TILES_Y_;
-  use_winograd_ = UseWinograd4x4To6x6() && use_fp16_;
+  use_winograd_ = UseWinograd4x4To6x6();
   // build kernel
   if (use_winograd_) {
@@ -247,7 +247,7 @@ int ConvolutionOpenCLKernel::InitBuffer() {
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
   if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
+    if (out_tensors_[0]->Width() * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
       im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
       im_dst_y = out_tensors_[0]->Height();
@@ -314,7 +314,8 @@ int ConvolutionOpenCLKernel::Run() {
   if (use_winograd_) {
     ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
-    ocl_runtime_->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2},
+                            nullptr);
     ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
     std::vector<size_t> global, local;
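The old global work size used truncating division, so for an odd `TILES_XY_` or `CO_SLICES_` the last tile/slice pair was never dispatched and part of the output stayed uncomputed. `UP_DIV` rounds up instead. A minimal sketch, assuming the usual ceil-division definition of the macro (the exact definition lives elsewhere in the repo):

```cpp
#include <cstdio>

// Assumed definition of MindSpore Lite's ceil-division macro.
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main() {
  int TILES_XY = 49;  // odd tile count, e.g. 7 x 7 output tiles
  printf("truncating: %d groups -> tile 48 is dropped\n", TILES_XY / 2);      // 24
  printf("UP_DIV:     %d groups -> all tiles covered\n", UP_DIV(TILES_XY, 2));  // 25
}
```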
@@ -414,7 +415,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     code += "    out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
-  if (OW_ * CO_SLICES_ < 65536) {
+  if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
     code += "    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
   } else {
     code += "    WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
@@ -616,23 +617,27 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
     "    FLT4 BtD_row[6] = {0};\n"
     "    for (int y = 0; y < 6; y++)\n"
     "    {\n"
-    "        int y_idx = tile_y * 4 - PAD + y;\n";
+    "        int ih = tile_y * 4 - PAD + y;\n";
   if (op_format_ == Format_NHWC4) {
-    code +=
-      "        for (int x = 0; x < 6; x++)\n"
-      "        {\n"
-      "            int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n";
+    code += "        int y_idx = ih;\n";
   } else if (op_format_ == Format_NC4HW4) {
     code +=
-      "        if(y_idx < 0 || y_idx >= IH)\n"
-      "        {\n"
-      "            continue;\n"
-      "        }\n"
-      "        y_idx += slice * IH;\n"
-      "        for (int x = 0; x < 6; x++)\n"
-      "        {\n"
-      "            int x_idx = tile_x * 4 - PAD + x;\n";
+      "        if(ih < 0 || ih >= IH) {continue;}\n"
+      "        int y_idx = slice * IH + ih;\n";
   }
+  code +=
+    "        for (int x = 0; x < 6; x++)\n"
+    "        {\n"
+    "            int iw = tile_x * 4 - PAD + x;\n";
+  if (op_format_ == Format_NHWC4) {
+    code +=
+      "            if(iw < 0 || iw >= IW) {continue;}\n"
+      "            int x_idx = iw * SLICES + slice;\n";
+  } else if (op_format_ == Format_NC4HW4) {
+    code += "            int x_idx = iw;\n";
+  }
   code +=
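Reassembling the string literals above, the generated kernel body for the Format_NHWC4 path now reads roughly as follows (OpenCL C, concatenated here for readability). Note the refactor hoists the x-loop out of the per-format branches and renames the raw coordinates to `ih`/`iw` so they can be bounds-checked before being turned into image coordinates; NC4HW4 must also skip out-of-range `ih`, since its y axis packs `slice * IH + ih` and an overrun would read a neighboring slice, while NHWC4 presumably relies on the image sampler's out-of-range behavior on that axis:

```c
FLT4 BtD_row[6] = {0};
for (int y = 0; y < 6; y++)
{
    int ih = tile_y * 4 - PAD + y;
    int y_idx = ih;                        /* NHWC4: rows map directly to the image y axis */
    for (int x = 0; x < 6; x++)
    {
        int iw = tile_x * 4 - PAD + x;
        if (iw < 0 || iw >= IW) { continue; }
        int x_idx = iw * SLICES + slice;   /* NHWC4: W and C4 slices packed on the x axis */
        /* ... READ_IMAGE and Bt accumulation emitted by the code that follows ... */
    }
}
```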
@@ -792,9 +797,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->act_type_ == ActType_Relu) {
-    code += "        acc = max(acc, (FLT4)(0.0f));\n";
+    code += "        acc = max(acc, (FLT4)(0.0f));\n\n";
   } else if (param->act_type_ == ActType_Relu6) {
-    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n";
+    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n\n";
   }
   code +=
@@ -838,7 +843,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   }
   if (op_format_ == Format_NHWC4) {
-    if (OW_ * CO_SLICES_ > 65536) {
+    if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
       local_w = 4;
     }
   }

@@ -81,8 +81,10 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   bool UseWinograd4x4To6x6() {
     auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
-                            param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
+    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
+                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
+                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
+                            IW_ == OW_;
     const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
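With the stricter check, Winograd F(4x4, 3x3) is only selected for the classic "SAME" 3x3 case: stride 1, dilation 1, padding 1 on all four sides, unchanged spatial size, plus enough channel slices and enough 4x4 output tiles to amortize the transform kernels. A small sketch evaluating the same predicate for a typical layer (the shape values are illustrative, not from the source, and `UP_DIV` is assumed to be ceil division):

```cpp
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // assumed ceil-division macro

int main() {
  // A typical ResNet-style 3x3 conv: 56x56 spatial, 64 -> 64 channels.
  int IH = 56, IW = 56, OH = 56, OW = 56, CI = 64, CO = 64;
  int ci_slices = UP_DIV(CI, 4);                // 16
  int co_slices = UP_DIV(CO, 4);                // 16
  int tiles = UP_DIV(OW, 4) * UP_DIV(OH, 4);    // 14 * 14 = 196
  bool channel_good = ci_slices >= 12 && co_slices >= 12;
  bool hw_good = tiles >= 16;
  bool same_shape = IH == OH && IW == OW;
  printf("use winograd: %d\n", channel_good && hw_good && same_shape);  // 1
}
```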

@@ -16,7 +16,8 @@ * you may not use this file except in compliance with the License.
 #ifndef MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
 #define MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
+// Get from Device?
+#define MAX_IMAGE2D_SIZE 65535
 #include <vector>
 #include <map>
 #include <memory>
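The `// Get from Device?` note is apt: 65535 here bounds the same range as the old `< 65536` checks, but the real Image2D limit varies per GPU, and the OpenCL spec only guarantees 8192 for `CL_DEVICE_IMAGE2D_MAX_WIDTH`/`HEIGHT` when images are supported. A sketch of querying it through the C++ wrapper this header already uses; the helper name and placement are hypothetical, not part of the commit:

```cpp
#include <CL/cl2.hpp>

// Hypothetical helper: query the device's actual Image2D limits
// instead of hardcoding 65535.
static void GetImage2DMaxSize(const cl::Device &device, size_t *max_w, size_t *max_h) {
  *max_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();   // spec minimum: 8192
  *max_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();  // spec minimum: 8192
}
```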
@@ -127,7 +128,6 @@ class OpenCLRuntime {
   int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
   int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
   bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);
-  bool IsInitOK() {return init_done_;}
   /**
    * Get kernel max worker group size.

@@ -6,3 +6,4 @@ mtk_AADB_HADB_MBV2_model_fp32.tflite
 hiai_cn_recognize_modify_padv2.tflite
 hiai_cv_focusShootOCRModel_08.tflite
 hiai_model_normalize_object_scene_ps_20200519.tflite
+inception_v3.tflite

@@ -44,10 +44,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
     return;
   }
   param->num_axes_ = 4;
-  param->perm_[0] = 0;
-  param->perm_[1] = 3;
-  param->perm_[2] = 1;
-  param->perm_[3] = 2;
+  param->perm_[0] = shape[3];
+  param->perm_[1] = shape[4];
+  param->perm_[2] = shape[5];
+  param->perm_[3] = shape[6];
   auto allocator = ocl_runtime->GetAllocator();
   int h = shape[0];
   int w = shape[1];
@@ -60,9 +60,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
     MS_LOG(ERROR) << "tensor_x create error.";
     return;
   }
-  std::vector<int> out_shape = {1, c, h, w};
+  std::vector<int> out_shape = {input_shape[param->perm_[0]], input_shape[param->perm_[1]],
+                                input_shape[param->perm_[2]], input_shape[param->perm_[3]]};
   auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NCHW);
+                                                       out_shape, schema::Format_NHWC);
   auto tensor_out = tensor_out_ptr.get();
   if (tensor_out == nullptr) {
     MS_LOG(ERROR) << "tensor_out create error.";
@@ -105,25 +106,63 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
   lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 
-TEST_F(TestTransposeOpenCL, TransposeFp32) {
+TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp32) {
   int h = 2;
   int w = 2;
   int c = 3;
-  std::vector<int> shape = {h, w, c};
+  int perm0 = 0;
+  int perm1 = 3;
+  int perm2 = 1;
+  int perm3 = 2;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
   std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
   std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
   RunTestTranspose(shape, input_data.data(), output_data.data(), false);
 }
 
-TEST_F(TestTransposeOpenCL, TransposeFp16) {
+TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp16) {
   int h = 2;
   int w = 2;
   int c = 3;
-  std::vector<int> shape = {h, w, c};
+  int perm0 = 0;
+  int perm1 = 3;
+  int perm2 = 1;
+  int perm3 = 2;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
   std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
   std::vector<float16_t> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
   RunTestTranspose(shape, input_data.data(), output_data.data(), true);
 }
 
+TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp32) {
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int perm0 = 0;
+  int perm1 = 2;
+  int perm2 = 3;
+  int perm3 = 1;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
+  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
+}
+
+TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp16) {
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int perm0 = 0;
+  int perm1 = 2;
+  int perm2 = 3;
+  int perm3 = 1;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float16_t> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
+  RunTestTranspose(shape, input_data.data(), output_data.data(), true);
+}
+
 } // namespace mindspore
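The expected vectors are easy to sanity-check by hand: the harness now reads the permutation out of `shape[3..6]` and derives `out_shape[i] = input_shape[perm[i]]`, with the same 1x2x2x3 input buffer for both directions and only the perm changing. A plain CPU reference transpose, written here under the standard interpretation that matches how `out_shape` is built in the diff (independent of the OpenCL kernel), reproduces both expected outputs:

```cpp
#include <cstdio>
#include <vector>

// Reference 4-D transpose: the element at output index (a,b,c,d) comes from
// the input index obtained by scattering (a,b,c,d) through perm.
std::vector<float> Transpose4D(const std::vector<float> &in, const std::vector<int> &in_shape,
                               const std::vector<int> &perm) {
  std::vector<int> out_shape = {in_shape[perm[0]], in_shape[perm[1]], in_shape[perm[2]], in_shape[perm[3]]};
  std::vector<int> stride = {in_shape[1] * in_shape[2] * in_shape[3], in_shape[2] * in_shape[3], in_shape[3], 1};
  std::vector<float> out(in.size());
  int idx = 0;
  for (int a = 0; a < out_shape[0]; ++a)
    for (int b = 0; b < out_shape[1]; ++b)
      for (int c = 0; c < out_shape[2]; ++c)
        for (int d = 0; d < out_shape[3]; ++d) {
          int src[4];
          src[perm[0]] = a;
          src[perm[1]] = b;
          src[perm[2]] = c;
          src[perm[3]] = d;
          out[idx++] = in[src[0] * stride[0] + src[1] * stride[1] + src[2] * stride[2] + src[3]];
        }
  return out;
}

int main() {
  std::vector<float> in(12);
  for (int i = 0; i < 12; ++i) in[i] = i;
  // NHWC2NCHW, perm {0,3,1,2}: prints 0 3 6 9 1 4 7 10 2 5 8 11
  for (float v : Transpose4D(in, {1, 2, 2, 3}, {0, 3, 1, 2})) printf("%g ", v);
  printf("\n");
  // NCHW2NHWC, perm {0,2,3,1}: prints 0 6 1 7 2 8 3 9 4 10 5 11
  for (float v : Transpose4D(in, {1, 2, 2, 3}, {0, 2, 3, 1})) printf("%g ", v);
  printf("\n");
}
```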
